def __init__(self, n_tokens, n_cells, db, emb, max_gen=10):
    self.n_tokens = n_tokens
    self.n_cells = n_cells
    self.max_gen = max_gen
    self.db = db
    self.emb = emb

    emb_dim = emb.size()

    self.input_rnn = LSTM(n_in=emb_dim, n_out=n_cells)
    self.output_rnn = LSTM(n_in=emb_dim, n_out=n_cells)
    self.output_rnn_clf = Sequential(
        [LinearLayer(n_in=n_cells, n_out=n_tokens), Softmax()])
    self.output_switch_p = Sequential(
        [LinearLayer(n_in=n_cells, n_out=1), Sigmoid()])
    self.att = Attention(n_hidden=n_cells)

    self.param_layers, self.param_layers_names = zip(*[
        (self.output_switch_p, 'switch'),
        (self.output_rnn_clf, 'out_rnn_clf'),
        (self.output_rnn, 'out_rnn'),
        (self.att, 'att'),
        (self.input_rnn, 'in_rnn'),
    ])
    self.print_widths = defaultdict(dict)
    self.parametrize_from_layers(self.param_layers, self.param_layers_names)
def ready(self, args, train):
    # len * batch
    self.idxs = T.imatrix()
    self.idys = T.imatrix()
    self.init_state = T.matrix(dtype=theano.config.floatX)

    dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
    self.dropout = theano.shared(dropout_prob)

    self.n_d = args["hidden_dim"]

    embedding_layer = EmbeddingLayer(n_d=self.n_d, vocab=set(w for w in train))
    self.n_V = embedding_layer.n_V

    say("Vocab size: {}\tHidden dim: {}\n".format(self.n_V, self.n_d))

    activation = get_activation_by_name(args["activation"])

    rnn_layer = LSTM(n_in=self.n_d, n_out=self.n_d, activation=activation)

    output_layer = Layer(
        n_in=self.n_d,
        n_out=self.n_V,
        activation=T.nnet.softmax,
    )

    # (len*batch) * n_d
    x_flat = embedding_layer.forward(self.idxs.ravel())

    # len * batch * n_d
    x = apply_dropout(x_flat, self.dropout)
    x = x.reshape((self.idxs.shape[0], self.idxs.shape[1], self.n_d))

    # len * batch * (n_d+n_d)
    h = rnn_layer.forward_all(x, self.init_state, return_c=True)
    self.last_state = h[-1]
    h = h[:, :, self.n_d:]
    h = apply_dropout(h, self.dropout)

    self.p_y_given_x = output_layer.forward(h.reshape(x_flat.shape))

    idys = self.idys.ravel()
    self.nll = -T.log(self.p_y_given_x[T.arange(idys.shape[0]), idys])
    #self.nll = T.nnet.categorical_crossentropy(
    #        self.p_y_given_x,
    #        idys
    #    )

    self.layers = [embedding_layer, rnn_layer, output_layer]
    #self.params = [ x_flat ] + rnn_layer.params + output_layer.params
    self.params = embedding_layer.params + rnn_layer.params + output_layer.params
    self.num_params = sum(
        len(x.get_value(borrow=True).ravel())
        for l in self.layers for x in l.params)
    say("# of params in total: {}\n".format(self.num_params))
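# Illustrative sketch (not part of the model above): why the fancy-indexing
# loss, -log(p[arange(N), y]), matches the commented-out
# T.nnet.categorical_crossentropy alternative. Plain NumPy, made-up values.
import numpy as np

p_y_given_x = np.array([[0.7, 0.2, 0.1],
                        [0.1, 0.6, 0.3]])    # softmax outputs, shape (N, V)
idys = np.array([0, 2])                      # gold next-token ids, shape (N,)

nll_indexed = -np.log(p_y_given_x[np.arange(len(idys)), idys])
one_hot = np.eye(p_y_given_x.shape[1])[idys]
nll_xent = -(one_hot * np.log(p_y_given_x)).sum(axis=1)

assert np.allclose(nll_indexed, nll_xent)    # both equal [-log 0.7, -log 0.3]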
def ready(self): args = self.args embedding_layer = self.embedding_layer num_aspects = self.num_aspects self.n_emb = embedding_layer.n_d dropout = self.dropout = theano.shared( np.float64(args.dropout_rate).astype(theano.config.floatX) ) self.x = T.imatrix('x') self.w_masks = T.fmatrix('mask') self.w_lens = T.fvector('sent_len') self.s_maxlen = T.iscalar('sent_max_len') self.s_num = T.iscalar('sent_num') self.y = T.ivector('y') self.ay = T.imatrix('ay') self.ay_mask = T.fmatrix('ay_mask') self.aay = T.itensor3('aay') x = self.x query = self.query w_masks = self.w_masks w_lens = self.w_lens s_ml = self.s_maxlen s_num = self.s_num n_emb = self.n_emb y = self.y ay = self.ay ay_mask = self.ay_mask aay = self.aay layers = self.layers = [embedding_layer] slices = embedding_layer.forward(x.ravel()) self.slices = slices = slices.reshape( (x.shape[0], x.shape[1], n_emb) ) slices_query = embedding_layer.forward(query.flatten(), is_node = False) slices_query = slices_query.reshape( (query.shape[0], query.shape[1], n_emb)) layers.append(Query_Repr_Layer(slices_query)) slices_query_tmp = slices_query = layers[-1].forward() layer = LSTM(n_in = n_emb, n_out = n_emb) layers.append(layer) prev_output = slices prev_output = apply_dropout(prev_output, dropout, v2=True) prev_output = layers[-1].forward_all(prev_output, w_masks) layer = Layer(n_in = n_emb, n_out = n_emb, activation = tanh) layers.append(layer) self.slices_query = slices_query = layers[-1].forward(slices_query) maskss = [] w_lenss = [] for i in range(num_aspects): maskss.append(w_masks) w_lenss.append(w_lens) maskss = T.concatenate(maskss, axis = 1) w_lenss = T.concatenate(w_lenss) layer = IterAttentionLayer(n_in = n_emb, n_out = n_emb) layers.append(layer) prev_output = layers[-1].forward(prev_output, slices_query, is_word = True, hop = args.hop_word, masks = w_masks, aspect_num = num_aspects) prev_output = prev_output.reshape((prev_output.shape[0] * prev_output.shape[1], prev_output.shape[2])) prev_output = apply_dropout(prev_output, dropout, v2=True) prev_output = prev_output.reshape((num_aspects, prev_output.shape[0] / (num_aspects * s_num), s_num, prev_output.shape[1])) prev_output = prev_output.dimshuffle(2, 0, 1, 3) prev_output = prev_output.reshape((prev_output.shape[0], prev_output.shape[1] * prev_output.shape[2], prev_output.shape[3])) layer = LSTM(n_in = n_emb * args.hop_word, n_out = n_emb) layers.append(layer) prev_output = layers[-1].forward_all(prev_output) #layers.append(Query_Repr_Layer(slices_query)) #slices_query = layers[-1].forward() layer = Layer(n_in = n_emb, n_out = n_emb, activation = tanh) layers.append(layer) slices_query = layers[-1].forward(slices_query_tmp) # bug layer = IterAttentionLayer(n_in = n_emb, n_out = n_emb) layers.append(layer) prev_output = layers[-1].forward(prev_output, slices_query, is_word = False, hop = args.hop_sent, aspect_num = num_aspects) prev_output = prev_output.reshape((prev_output.shape[0] * prev_output.shape[1], prev_output.shape[2])) prev_output = apply_dropout(prev_output, dropout, v2=True) prev_output = prev_output.reshape((num_aspects, prev_output.shape[0] / num_aspects, prev_output.shape[1])) softmax_inputs = [] for i in range(num_aspects): softmax_inputs.append(prev_output[i]) size = n_emb * args.hop_sent p_y_given_a = [] pred_ay = [] nll_loss_ay = [] for i in range(num_aspects): layers.append(Layer(n_in = size, n_out = args.score_scale, activation = softmax, has_bias = False,)) p_y_given_a.append(layers[-1].forward(softmax_inputs[i])) nll_loss_ay.append( T.mean(T.sum( 
-T.log(p_y_given_a[-1]) * aay[:, i, :] * ay_mask[:, i].dimshuffle(0, 'x')))) pred_ay.append(T.argmax(p_y_given_a[-1], axis = 1)) self.p_y_given_a = p_y_given_a self.nll_loss_ay = T.sum(nll_loss_ay) self.pred_ay = T.stack(pred_ay).dimshuffle(1, 0) for l,i in zip(layers[4:], range(len(layers[3:]))): say("layer {}: n_in={}\tn_out={}\n".format( i, l.n_in, l.n_out )) self.l2_sqr = None self.params = [ ] for layer in layers: self.params += layer.params for p in self.params: if self.l2_sqr is None: self.l2_sqr = args.l2_reg * T.sum(p**2) else: self.l2_sqr += args.l2_reg * T.sum(p**2) nparams = sum(len(x.get_value(borrow=True).ravel()) \ for x in self.params) say("total # parameters: {}\n".format(nparams))
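# Illustrative sketch of the masked per-aspect loss above: the one-hot label
# tensor (aay) picks out the gold score and the mask (ay_mask) zeroes out
# documents that carry no rating for aspect i. Plain NumPy, made-up values.
import numpy as np

p = np.array([[0.5, 0.3, 0.2],     # p_y_given_a[i], batch of 2, score_scale 3
              [0.2, 0.2, 0.6]])
aay_i = np.array([[0, 1, 0],       # one-hot gold scores for aspect i
                  [1, 0, 0]])
ay_mask_i = np.array([1.0, 0.0])   # second document has no label for aspect i

loss_i = np.sum(-np.log(p) * aay_i * ay_mask_i[:, None])
assert np.isclose(loss_i, -np.log(0.3))   # only the first document contributes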
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim,
          word_lstm_dim, word_bidirect, pos_dim, pos_lstm_dim, lr_method,
          lr_rate, clip_norm, crf, is_train, **kwargs):
    """
    Build the network.
    """
    # Number of distinct values of each variable
    n_words = len(self.id_to_word)
    n_pos_tags = len(self.id_to_pos)
    n_chars = len(self.id_to_char)
    n_tags = len(self.id_to_tag)

    # Network variables
    self.word_ids = tf.placeholder(
        tf.int32, shape=[None, None],
        name='word_ids')  # word indices, shape: [batch_size, max_word_len]
    self.word_pos_ids = tf.placeholder(
        tf.int32, shape=[None],
        name='word_pos_ids')  # word position indices, shape: [batch_size]
    self.pos_ids = tf.placeholder(
        tf.int32, shape=[None, None],
        name='pos_ids')  # POS tag indices, shape: [batch_size, max_pos_len]
    self.char_for_ids = tf.placeholder(
        tf.int32, shape=[None, None, None], name='char_for_ids'
    )  # forward character indices, shape: [batch_size, word_max_len, char_max_len]
    self.char_rev_ids = tf.placeholder(
        tf.int32, shape=[None, None, None], name='char_rev_ids'
    )  # backward character indices, shape: [batch_size, word_max_len, char_max_len]
    self.char_pos_ids = tf.placeholder(
        tf.int32, shape=[None, None], name='char_pos_ids'
    )  # character position indices, shape: [batch_size*word_max_len, char_max_len]
    self.tag_ids = tf.placeholder(
        tf.int32, shape=[None, None],
        name='tag_ids')  # NER tag indices, shape: [batch_size, word_max_len]
    self.tag_id_trans = tf.placeholder(
        tf.int32, shape=[None, None, None], name='tag_id_trans'
    )  # NER tag transition indices, shape: [batch_size, word_max_len+1, 2]
    self.tag_id_index = tf.placeholder(
        tf.int32, shape=[None, None, None],
        name='tag_id_index')  # shape: [batch_size, word_max_len, 2]

    # Final output (features of all words)
    input_dim = 0
    inputs = []

    #
    # Word input vectors
    #
    if word_dim:
        input_dim += word_dim
        with tf.device("/cpu:0"):
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(self.word_ids)
        inputs.append(word_input)

    #
    # POS tag input vectors
    #
    if pos_dim:
        input_dim += pos_dim
        with tf.device("/cpu:0"):
            pos_layer = EmbeddingLayer(n_pos_tags, pos_dim, name='pos_layer')
            pos_input = pos_layer.link(self.pos_ids)
        inputs.append(pos_input)

    #
    # Character input vectors
    #
    if char_dim:
        input_dim += char_lstm_dim
        char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')
        char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True,
                             name='char_lstm_for')
        char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True,
                             name='char_lstm_rev')
        with tf.device("/cpu:0"):
            char_for_embedding_batch = char_layer.link(self.char_for_ids)
            char_rev_embedding_batch = char_layer.link(self.char_rev_ids)
        shape_for = tf.shape(char_for_embedding_batch)
        # reshape from [batch_size, word_max_len, char_max_len, char_dim]
        # to [batch_size*word_max_len, char_max_len, char_dim]
        char_for_embedding = tf.reshape(
            char_for_embedding_batch,
            (shape_for[0] * shape_for[1], shape_for[2], shape_for[3]))
        shape_rev = tf.shape(char_rev_embedding_batch)
        char_rev_embedding = tf.reshape(
            char_rev_embedding_batch,
            (shape_rev[0] * shape_rev[1], shape_rev[2], shape_rev[3]))
        char_lstm_for_states = char_lstm_for.link(char_for_embedding)
        char_lstm_rev_states = char_lstm_rev.link(char_rev_embedding)
        char_lstm_for_h_trans = tf.transpose(char_lstm_for_states[1], (1, 0, 2),
                                             name='char_lstm_for_h_trans')
        char_lstm_rev_h_trans = tf.transpose(char_lstm_rev_states[1], (1, 0, 2),
                                             name='char_lstm_rev_h_trans')
        char_for_output = tf.gather_nd(char_lstm_for_h_trans, self.char_pos_ids,
                                       name='char_for_output')
        char_rev_output = tf.gather_nd(char_lstm_rev_h_trans, self.char_pos_ids,
                                       name='char_rev_output')
        char_for_output_batch = tf.reshape(
            char_for_output, (shape_for[0], shape_for[1], char_lstm_dim))
        char_rev_output_batch = tf.reshape(
            char_rev_output, (shape_rev[0], shape_rev[1], char_lstm_dim))
        inputs.append(char_for_output_batch)
        if char_bidirect:
            inputs.append(char_rev_output_batch)
            input_dim += char_lstm_dim

    inputs = tf.concat(inputs, axis=-1)

    # Add a dropout layer on the final input
    assert dropout < 1 and 0.0 <= dropout
    if dropout:
        input_train = tf.nn.dropout(inputs, 1 - dropout)
        if is_train:
            inputs = input_train

    # LSTM for words
    word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=True,
                         name='word_lstm_for')
    word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=True,
                         name='word_lstm_rev')

    # Output of the forward hidden layer
    word_states_for = word_lstm_for.link(inputs)
    word_lstm_for_output = tf.transpose(word_states_for[1], (1, 0, 2),
                                        name='word_lstm_for_h_trans')

    # Output of the backward hidden layer
    inputs_rev = tf.reverse_sequence(inputs, self.word_pos_ids, seq_dim=1,
                                     batch_dim=0)
    word_states_rev = word_lstm_rev.link(inputs_rev)
    word_lstm_rev_h_trans = tf.transpose(word_states_rev[1], (1, 0, 2),
                                         name='word_lstm_rev_h_trans')
    word_lstm_rev_output = tf.reverse_sequence(word_lstm_rev_h_trans,
                                               self.word_pos_ids, seq_dim=1,
                                               batch_dim=0)

    if word_bidirect:
        final_output = tf.concat(
            [word_lstm_for_output, word_lstm_rev_output], axis=-1)
        tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
                                 name='tanh_layer', activation='tanh')
        final_output = tanh_layer.link(final_output)
    else:
        final_output = word_lstm_for_output

    final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer')
    tags_scores = final_layer.link(final_output)

    if not crf:
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.tag_ids, logits=tags_scores, name='xentropy')
        cost = tf.reduce_mean(cross_entropy, name='xentropy_mean')
    else:
        transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
        small = -1000
        b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
        e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)

        # for batch observation
        #def recurrence(prev, obs):
        #    s_len = tf.shape(obs)[0]
        #    obvs = tf.concat([obs, small * tf.ones((s_len, 2))], axis=1)
        #    observations = tf.concat([b_s, obvs, e_s], axis=0)
        #    return observations
        #tags_scores_shape = tf.shape(tags_scores)
        #obs_initial = tf.ones((tags_scores_shape[1] + 2, n_tags + 2))
        #obs_batch = tf.scan(fn=recurrence, elems=tags_scores, initializer=obs_initial)

        # Compute the score of the tags
        def recurrence_real_score(prev, obs):
            tags_score = obs[0]
            tag_id_index_ = obs[1]
            tag_id_trans_ = obs[2]
            word_pos_ = obs[3] + 1
            tags_score_slice = tags_score[0:word_pos_, :]
            tag_id_index_slice = tag_id_index_[0:word_pos_, :]
            tag_id_trans_slice = tag_id_trans_[0:(word_pos_ + 1), :]
            real_path_score = tf.reduce_sum(
                tf.gather_nd(tags_score_slice, tag_id_index_slice))
            real_path_score += tf.reduce_sum(
                tf.gather_nd(transitions, tag_id_trans_slice))
            return tf.reshape(real_path_score, [])

        real_path_score_list = tf.scan(fn=recurrence_real_score,
                                       elems=[
                                           tags_scores, self.tag_id_index,
                                           self.tag_id_trans, self.word_pos_ids
                                       ],
                                       initializer=0.0)

        def recurrence_all_path(prev, obs):
            tags_score = obs[0]
            word_pos_ = obs[1] + 1
            tags_score_slice = tags_score[0:word_pos_, :]
            s_len = tf.shape(tags_score_slice)[0]
            obvs = tf.concat(
                [tags_score_slice, small * tf.ones((s_len, 2))], axis=1)
            observations = tf.concat([b_s, obvs, e_s], axis=0)
            all_paths_scores = forward(observations, transitions)
            return tf.reshape(all_paths_scores, [])

        all_paths_scores_list = tf.scan(
            fn=recurrence_all_path,
            elems=[tags_scores, self.word_pos_ids],
            initializer=0.0)

        cost = -tf.reduce_mean(real_path_score_list - all_paths_scores_list)

    # Network parameters
    if not crf:
        f_score = tf.nn.softmax(tags_scores)
    else:
        def recurrence_predict(prev, obs):
            tags_score = obs[0]
            word_pos_ = obs[1] + 1
            tags_score_slice = tags_score[0:word_pos_, :]
            s_len = tf.shape(tags_score_slice)[0]
            obvs = tf.concat(
                [tags_score_slice, small * tf.ones((s_len, 2))], axis=1)
            observations = tf.concat([b_s, obvs, e_s], axis=0)
            all_paths_scores = forward(observations, transitions,
                                       viterbi=True,
                                       return_alpha=False,
                                       return_best_sequence=True)
            all_paths_scores = tf.concat([
                all_paths_scores,
                tf.zeros([tf.shape(tags_score)[0] - s_len], tf.int32)
            ], axis=0)
            return all_paths_scores

        f_score = tf.scan(fn=recurrence_predict,
                          elems=[tags_scores, self.word_pos_ids],
                          initializer=tf.zeros(
                              [tf.shape(tags_scores)[1] + 2], tf.int32))

    # Choose the optimization method
    tvars = tf.trainable_variables()
    grads = tf.gradients(cost, tvars)
    if clip_norm > 0:
        grads, _ = tf.clip_by_global_norm(grads, clip_norm)
    if lr_method == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(lr_rate)
    elif lr_method == 'adagrad':
        optimizer = tf.train.AdagradOptimizer(lr_rate)
    elif lr_method == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(lr_rate)
    elif lr_method == 'adam':
        optimizer = tf.train.AdamOptimizer(lr_rate)
    elif lr_method == 'rmsprop':
        optimizer = tf.train.RMSPropOptimizer(lr_rate)
    else:
        raise Exception("Not implemented learning method: %s" % lr_method)
    train_op = optimizer.apply_gradients(zip(grads, tvars))

    # Visualize the cost trend in TensorBoard
    tf.summary.scalar('loss', cost)

    return cost, f_score, train_op
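# Illustrative sketch of the two CRF quantities the scans above compute for a
# single sentence: the score of the gold path (emission + transition terms)
# and the log-partition over all paths via the forward recursion. The padded
# start/end states handled with the `small` constant are omitted; all numbers
# and the gold path are made up.
import numpy as np

emissions = np.array([[1.0, 0.2],     # tags_scores: (seq_len=3, n_tags=2)
                      [0.3, 0.9],
                      [0.5, 0.4]])
transitions = np.array([[0.1, 0.8],   # transitions[i, j]: score of tag i -> j
                        [0.4, 0.2]])
gold = [0, 1, 1]

real_path_score = emissions[np.arange(3), gold].sum() + \
    sum(transitions[gold[t], gold[t + 1]] for t in range(2))

# forward algorithm: alpha[j] = logsumexp_i(alpha[i] + trans[i, j]) + emit[t, j]
alpha = emissions[0].copy()
for t in range(1, 3):
    scores = alpha[:, None] + transitions + emissions[t][None, :]
    alpha = np.log(np.exp(scores).sum(axis=0))
all_paths_score = np.log(np.exp(alpha).sum())

cost_one_sentence = -(real_path_score - all_paths_score)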
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim, word_lstm_dim, word_bidirect, lr_method, pre_emb, crf, cap_dim, model_type, training=True, **kwargs): """ Build the network. """ # Training parameters layer_weighting = "fixed" n_words = len(self.id_to_word) n_chars = len(self.id_to_char) print "-------------------------------MODEL INFO---------------------------------------" print "** model_type", model_type print "** n_words, n_chars:", n_words, n_chars print "** self.feature_maps:" for f in self.feature_maps: print f["name"], f print "** self.tag_maps:" for tm in self.tag_maps: print tm print "---------------------------------------------------------------------------------" # Number of capitalization features if cap_dim: n_cap = 4 # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') features_ids = [] for f in self.feature_maps: features_ids.append(T.ivector(name=f['name'] + '_ids')) # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # # Word inputs # if word_dim: input_dim += word_dim print "** input_dim (input_dim += word_dim)", input_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training: new_weights = word_layer.embeddings.get_value() print 'Loading pretrained embeddings from %s...' % pre_emb pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word.lower()) in pretrained: new_weights[i] = pretrained[re.sub( '\d', '0', word.lower())] c_zeros += 1 word_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained embeddings.' % len(pretrained) print( '%i / %i (%.4f%%) words have been initialized with ' 'pretrained embeddings.') % ( c_found + c_lower + c_zeros, n_words, 100. 
* (c_found + c_lower + c_zeros) / n_words) print( '%i found directly, %i after lowercasing, ' '%i after lowercasing + zero.') % (c_found, c_lower, c_zeros) # # Chars inputs # if char_dim: input_dim += char_lstm_dim print "** input_dim (input_dim += char_lstm_dim)", input_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] char_rev_output = char_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim print "** input_dim (input_dim += char_lstm_dim: char_bidirect)", input_dim # # Capitalization feature # if cap_dim: input_dim += cap_dim print "** input_dim (input_dim += cap_dim)", input_dim cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) f_layers = [] for ilayer in range(len(self.feature_maps)): f = self.feature_maps[ilayer] input_dim += f['dim'] print "** input_dim (input_dim += f['dim'])", input_dim af_layer = EmbeddingLayer(len(f['id_to_ftag']), f['dim'], name=f['name'] + '_layer') f_layers.append(af_layer) inputs.append(af_layer.link(features_ids[ilayer])) # Prepare final input inputs = T.concatenate(inputs, axis=1) # inputs_nodropout = inputs # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) assert model_type in { "struct", "struct_mlp", "struct_mlp2", "multilayer", "single" } # Network parameters: Part 1 (Common parameters) params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) for af_layer in f_layers: self.add_component(af_layer) params.extend(af_layer.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) if cap_dim: self.add_component(cap_layer) params.extend(cap_layer.params) if model_type == "multilayer" or model_type == "single": tags_scores_list = [] tag_ids_list = [] cost_list = [] observations_list = [] transitions_list = [] prev_input_dim = input_dim prev_ntags = 0 prev_tags_cores = None previous_inputs = inputs for ilayer in range(len(self.tag_maps)): inputs_i = previous_inputs if prev_tags_cores == None else T.concatenate( [previous_inputs, prev_tags_cores], axis=1) previous_inputs = inputs_i input_dim_i = prev_input_dim + prev_ntags print "input_dim_i for layer %d: %d" % (ilayer, input_dim_i) word_lstm_for_i = LSTM(input_dim_i, word_lstm_dim, with_batch=False, name='word_lstm_for' + str(ilayer)) word_lstm_rev_i = LSTM(input_dim_i, word_lstm_dim, with_batch=False, name='word_lstm_rev' + str(ilayer)) word_lstm_for_i.link(inputs_i) word_lstm_rev_i.link(inputs_i[::-1, :]) word_for_output_i = word_lstm_for_i.h word_rev_output_i = word_lstm_rev_i.h[::-1, :] if word_bidirect: final_output_i = T.concatenate( [word_for_output_i, word_rev_output_i], axis=1) tanh_layer_i = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer' + str(ilayer), 
activation='tanh') final_output_i = tanh_layer_i.link(final_output_i) else: final_output_i = word_for_output_i n_tags_i = len(self.tag_maps[ilayer]['id_to_tag']) final_layer_i = HiddenLayer( word_lstm_dim, n_tags_i, name='final_layer' + str(ilayer), activation=(None if crf else 'softmax')) tags_scores_i = final_layer_i.link(final_output_i) tag_ids_i = T.ivector(name='tag_ids' + str(ilayer)) # input tags of layer i # No CRF if not crf: cost_i = T.nnet.categorical_crossentropy( tags_scores_i, tag_ids_i).mean() # CRF else: transitions_i = shared((n_tags_i + 2, n_tags_i + 2), 'transitions' + str(ilayer)) small1 = -1000 b_s1 = np.array([[small1] * n_tags_i + [0, small1] ]).astype(np.float32) e_s1 = np.array([[small1] * n_tags_i + [small1, 0] ]).astype(np.float32) observations_i = T.concatenate( [tags_scores_i, small1 * T.ones((s_len, 2))], axis=1) observations_i = T.concatenate( [b_s1, observations_i, e_s1], axis=0) # Score from tags real_path_score1 = tags_scores_i[T.arange(s_len), tag_ids_i].sum() # Score from transitions b_id1 = theano.shared( value=np.array([n_tags_i], dtype=np.int32)) e_id1 = theano.shared( value=np.array([n_tags_i + 1], dtype=np.int32)) padded_tags_ids1 = T.concatenate([b_id1, tag_ids_i, e_id1], axis=0) real_path_score1 += transitions_i[ padded_tags_ids1[T.arange(s_len + 1)], padded_tags_ids1[T.arange(s_len + 1) + 1]].sum() all_paths_scores1 = forward(observations_i, transitions_i) cost_i = -(real_path_score1 - all_paths_scores1) observations_list.append(observations_i) transitions_list.append(transitions_i) prev_input_dim = input_dim_i prev_ntags = n_tags_i prev_tags_cores = tags_scores_i * 1 cost_list.append(cost_i) # add cost of layer i into cost list tags_scores_list.append(tags_scores_i) tag_ids_list.append(tag_ids_i) # Network parameters: Part 2 (add parameters of mutilayer architectures) self.add_component(word_lstm_for_i) params.extend(word_lstm_for_i.params) #1 if word_bidirect: self.add_component(word_lstm_rev_i) params.extend(word_lstm_rev_i.params) #2 self.add_component(final_layer_i) params.extend(final_layer_i.params) #3 if crf: self.add_component(transitions_i) params.append(transitions_i) #4 if word_bidirect: self.add_component(tanh_layer_i) params.extend(tanh_layer_i.params) #5 # end for loop elif model_type == "struct" or model_type.startswith("struct_mlp"): # begin step 1: Using BI-LSTM to encode the sequence word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] if word_bidirect: lstm_output = T.concatenate([word_for_output, word_rev_output], axis=1) tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') lstm_output = tanh_layer.link(lstm_output) else: lstm_output = word_for_output # end step 1: final_output is the list of hidden states. 
Shapes of hidden state is prev_ntags = 0 tags_scores_list = [] prev_tags_cores = None final_layer_list = [] final_output = lstm_output mlp_list = [] if model_type == "struct": for ilayer in range(0, len(self.tag_maps)): n_tags_i = len(self.tag_maps[ilayer]['id_to_tag']) final_output = final_output if prev_tags_cores == None else T.concatenate( [final_output, prev_tags_cores], axis=1) final_layer_i = HiddenLayer( word_lstm_dim + prev_ntags, n_tags_i, name='final_layer_' + str(ilayer), activation=(None if crf else 'softmax')) tags_scores_i = final_layer_i.link(final_output) prev_ntags += n_tags_i prev_tags_cores = tags_scores_i tags_scores_list.append(tags_scores_i) final_layer_list.append(final_layer_i) elif model_type.startswith("struct_mlp"): for ilayer in range(0, len(self.tag_maps)): n_tags_i = len(self.tag_maps[ilayer]['id_to_tag']) final_output = final_output if prev_tags_cores == None else T.concatenate( [final_output, prev_tags_cores], axis=1) if model_type == "struct_mlp2": mlp_sizes = [ word_lstm_dim + prev_ntags, word_lstm_dim, word_lstm_dim ] else: mlp_sizes = [word_lstm_dim + prev_ntags, word_lstm_dim] mlp_input = final_output for j in range(len(mlp_sizes) - 1): mlp_layer = HiddenLayer(mlp_sizes[j], mlp_sizes[j + 1], name="mlp" + str(j + 1) + "_layer_" + str(ilayer), activation="tanh") mlp_input = mlp_layer.link(mlp_input) mlp_list.append(mlp_layer) final_layer_i = HiddenLayer( word_lstm_dim, n_tags_i, name='final_layer_' + str(ilayer), activation=(None if crf else 'softmax')) tags_scores_i = final_layer_i.link(mlp_input) # # unroll version # mlp1_layer_i = HiddenLayer(word_lstm_dim + prev_ntags, word_lstm_dim, # name="mlp1_layer_" + str(ilayer), activation="tanh") # mlp1_layer_i_out = mlp1_layer_i.link(final_output) # # mlp2_layer_i = HiddenLayer(word_lstm_dim, word_lstm_dim, # name="mlp2_layer_" + str(ilayer), activation="tanh") # mlp2_layer_i_out = mlp2_layer_i.link(mlp1_layer_i_out) # mlp_list.append(mlp1_layer_i) # mlp_list.append(mlp2_layer_i) # # final_layer_i = HiddenLayer(word_lstm_dim, n_tags_i, name='final_layer_' + str(ilayer), # activation=(None if crf else 'softmax')) # tags_scores_i = final_layer_i.link(mlp2_layer_i_out) prev_ntags += n_tags_i prev_tags_cores = tags_scores_i tags_scores_list.append(tags_scores_i) final_layer_list.append(final_layer_i) else: print(model_type, " is not exits !") raise # # unroll code # n_tags_0 = len(self.tag_maps[0]['id_to_tag']) # final_layer_0 = HiddenLayer(word_lstm_dim, n_tags_0, name='final_layer_0', activation=(None if crf else 'softmax')) # tags_scores_0 = final_layer_0.link(final_output) # # n_tags_1 = len(self.tag_maps[1]['id_to_tag']) # final_layer_1 = HiddenLayer(word_lstm_dim + n_tags_0, n_tags_1, name='final_layer_1', activation=(None if crf else 'softmax')) # final_output = T.concatenate( [final_output, tags_scores_0], axis=1 ) # tags_scores_1 = final_layer_1.link(final_output) # # n_tags_2 = len(self.tag_maps[2]['id_to_tag']) # final_layer_2 = HiddenLayer(word_lstm_dim + n_tags_0 + n_tags_1, n_tags_2, name='final_layer_2', # activation=(None if crf else 'softmax')) # final_output = T.concatenate([final_output, tags_scores_1], axis=1) # tags_scores_2 = final_layer_2.link(final_output) # tags_scores_list = [tags_scores_0, tags_scores_1, tags_scores_2] tag_ids_list = [] observations_list = [] transitions_list = [] cost_list = [] for ilayer in range(0, len(self.tag_maps)): tag_ids_i = T.ivector(name='tag_ids' + str(ilayer)) # input tags tag_ids_list.append(tag_ids_i) tags_scores_i = tags_scores_list[ilayer] n_tags_i = 
len(self.tag_maps[ilayer]['id_to_tag']) # No CRF if not crf: cost_i = T.nnet.categorical_crossentropy( tags_scores_i, tag_ids_i).mean() # CRF else: transitions_i = shared((n_tags_i + 2, n_tags_i + 2), 'transitions' + str(ilayer)) small1 = -1000 b_s1 = np.array([[small1] * n_tags_i + [0, small1] ]).astype(np.float32) e_s1 = np.array([[small1] * n_tags_i + [small1, 0] ]).astype(np.float32) observations_i = T.concatenate( [tags_scores_i, small1 * T.ones((s_len, 2))], axis=1) observations_i = T.concatenate( [b_s1, observations_i, e_s1], axis=0) # Score from tags real_path_score1 = tags_scores_i[T.arange(s_len), tag_ids_i].sum() # Score from transitions b_id1 = theano.shared( value=np.array([n_tags_i], dtype=np.int32)) e_id1 = theano.shared( value=np.array([n_tags_i + 1], dtype=np.int32)) padded_tags_ids1 = T.concatenate([b_id1, tag_ids_i, e_id1], axis=0) real_path_score1 += transitions_i[ padded_tags_ids1[T.arange(s_len + 1)], padded_tags_ids1[T.arange(s_len + 1) + 1]].sum() all_paths_scores1 = forward(observations_i, transitions_i) cost_i = -(real_path_score1 - all_paths_scores1) observations_list.append(observations_i) transitions_list.append(transitions_i) cost_list.append(cost_i) # add cost of layer i into cost list # Network parameters: Part 2 (add parameters of struct architectures) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) for mlp_layer in mlp_list: self.add_component(mlp_layer) params.extend(mlp_layer.params) for final_layer in final_layer_list: self.add_component(final_layer) params.extend(final_layer.params) # # unroll code # self.add_component(final_layer_0) # params.extend(final_layer_0.params) # # self.add_component(final_layer_1) # params.extend(final_layer_1.params) # # self.add_component(final_layer_2) # params.extend(final_layer_2.params) if crf: for transitions in transitions_list: self.add_component(transitions) params.append(transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) # elif model_type == "multilayer_original": # print "** input_dim FOR LAYER 0 ", input_dim # # LSTM for words # word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, # name='word_lstm_for') # word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, # name='word_lstm_rev') # # word_lstm_for.link(inputs) # word_lstm_rev.link(inputs[::-1, :]) # word_for_output = word_lstm_for.h # word_rev_output = word_lstm_rev.h[::-1, :] # if word_bidirect: # final_output = T.concatenate( # [word_for_output, word_rev_output], # axis=1 # ) # tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, # name='tanh_layer', activation='tanh') # final_output = tanh_layer.link(final_output) # else: # final_output = word_for_output # # # Sentence to Named Entity tags - Score # n_tags = len(self.tag_maps[0]['id_to_tag']) # # final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', # activation=(None if crf else 'softmax')) # tags_scores = final_layer.link(final_output) # tag_ids = T.ivector(name='tag_ids0') # input tags of layer i # # # No CRF # if not crf: # cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # # CRF # else: # transitions = shared((n_tags + 2, n_tags + 2), 'transitions') # # small = -1000 # b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) # e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) # observations = T.concatenate( # [tags_scores, small * T.ones((s_len, 2))], # axis=1 # 
) # observations = T.concatenate( # [b_s, observations, e_s], # axis=0 # ) # # # Score from tags # real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # # # Score from transitions # b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) # e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) # padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) # real_path_score += transitions[ # padded_tags_ids[T.arange(s_len + 1)], # padded_tags_ids[T.arange(s_len + 1) + 1] # ].sum() # # all_paths_scores = forward(observations, transitions) # cost = - (real_path_score - all_paths_scores) # # print "cost: ", cost # # Network parameters # # # self.add_component(word_lstm_for) # params.extend(word_lstm_for.params) #1 # # if word_bidirect: # self.add_component(word_lstm_rev) # params.extend(word_lstm_rev.params) #2 # # self.add_component(final_layer) # params.extend(final_layer.params) #3 # # if crf: # self.add_component(transitions) # params.append(transitions) #4 # # if word_bidirect: # self.add_component(tanh_layer) # params.extend(tanh_layer.params) #5 # # # # # layer 1 to n # # # tags_scores_list = [tags_scores] # tag_ids_list = [tag_ids] # cost_list = [cost] # observations_list = [observations] # transitions_list = [transitions] # prev_input_dim = input_dim # prev_ntags = n_tags # prev_tags_cores = tags_scores * 1 # # for ilayer in range(1, len(self.tag_maps)): # inputs_i = previous_inputs * 1 # inputs_i.append(prev_tags_cores) # previous_inputs = inputs_i * 1 # # inputs_i = T.concatenate(inputs_i, axis=1) # input_dim_i = prev_input_dim + prev_ntags # # word_lstm_for_i = LSTM(input_dim_i, word_lstm_dim, with_batch=False, name='word_lstm_for' + str(ilayer)) # word_lstm_rev_i = LSTM(input_dim_i, word_lstm_dim, with_batch=False, name='word_lstm_rev' + str(ilayer)) # word_lstm_for_i.link(inputs_i) # word_lstm_rev_i.link(inputs_i[::-1, :]) # word_for_output_i = word_lstm_for_i.h # word_rev_output_i = word_lstm_rev_i.h[::-1, :] # # if word_bidirect: # final_output_i = T.concatenate( # [word_for_output_i, word_rev_output_i], # axis=1 # ) # tanh_layer_i = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, # name='tanh_layer' + str(ilayer), activation='tanh') # final_output_i = tanh_layer_i.link(final_output_i) # else: # final_output_i = word_for_output_i # # n_tags_i = len(self.tag_maps[ilayer]['id_to_tag']) # # final_layer_i = HiddenLayer(word_lstm_dim, n_tags_i, name='final_layer' + str(ilayer), # activation=(None if crf else 'softmax')) # tags_scores_i = final_layer_i.link(final_output_i) # tags_scores_list.append(tags_scores_i) # tag_ids_i = T.ivector(name='tag_ids' + str(ilayer)) # input tags # tag_ids_list.append(tag_ids_i) # # # No CRF # if not crf: # cost_i = T.nnet.categorical_crossentropy(tags_scores_i, tag_ids_i).mean() # # CRF # else: # transitions_i = shared((n_tags_i + 2, n_tags_i + 2), 'transitions' + str(ilayer)) # small1 = -1000 # b_s1 = np.array([[small1] * n_tags_i + [0, small1]]).astype(np.float32) # e_s1 = np.array([[small1] * n_tags_i + [small1, 0]]).astype(np.float32) # observations_i = T.concatenate([tags_scores_i, small1 * T.ones((s_len, 2))], axis=1) # observations_i = T.concatenate([b_s1, observations_i, e_s1], axis=0) # # # Score from tags # real_path_score1 = tags_scores_i[T.arange(s_len), tag_ids_i].sum() # # # Score from transitions # b_id1 = theano.shared(value=np.array([n_tags_i], dtype=np.int32)) # e_id1 = theano.shared(value=np.array([n_tags_i + 1], dtype=np.int32)) # padded_tags_ids1 = T.concatenate([b_id1, tag_ids_i, e_id1], axis=0) # 
real_path_score1 += transitions_i[ # padded_tags_ids1[T.arange(s_len + 1)], # padded_tags_ids1[T.arange(s_len + 1) + 1] # ].sum() # # all_paths_scores1 = forward(observations_i, transitions_i) # # cost_i = - (real_path_score1 - all_paths_scores1) # # observations_list.append(observations_i) # transitions_list.append(transitions_i) # # prev_input_dim = input_dim_i # prev_ntags = n_tags_i # prev_tags_cores = tags_scores_i * 1 # cost_list.append(cost_i) # add cost of layer i into cost list # # # add parameters # # self.add_component(word_lstm_for_i) # params.extend(word_lstm_for_i.params) # # if word_bidirect: # self.add_component(word_lstm_rev_i) # params.extend(word_lstm_rev_i.params) # # self.add_component(final_layer_i) # params.extend(final_layer_i.params) # # if crf: # self.add_component(transitions_i) # params.append(transitions_i) # # if word_bidirect: # self.add_component(tanh_layer_i) # params.extend(tanh_layer_i.params) # # # end for loop if layer_weighting == "fixed": if len(self.tag_maps) == 2: cost_weights = np.array([0.4, 0.6]) elif len(self.tag_maps) == 3: cost_weights = np.array([0.4, 0.3, 0.3]) else: cost_weights = np.ones( (len(self.tag_maps), )) / len(self.tag_maps) costall = np.sum(cost_weights * np.array(cost_list)) else: # https://groups.google.com/forum/#!topic/theano-users/XDG6MM83grI weights = np.ones((len(self.tag_maps), )) / len(self.tag_maps) cost_weights = theano.shared(weights.astype(theano.config.floatX), name="layer_weights") layer_weights = theano.tensor.nnet.sigmoid(cost_weights) params.extend([cost_weights]) xx = theano.tensor.mul(layer_weights, theano.tensor.as_tensor_variable(cost_list)) costall = theano.tensor.sum(xx) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) for ilayer in range(len(self.feature_maps)): eval_inputs.append(features_ids[ilayer]) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) train_inputs = eval_inputs + tag_ids_list print "-- train_inputs: ", print train_inputs # [word_ids, pos_ids, chunk_ids, wh_ids, if_ids, s_ids, tag_ids, tag_ids1, tag_ids2] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' 
if training: # print "train_inputs[9]", train_inputs[9] print "-- len(cost_list): ", len(cost_list) updates = Optimization(clip=5.0).get_updates( lr_method_name, costall, params, **lr_method_parameters) f_train = theano.function(inputs=train_inputs, outputs=costall, updates=updates, givens=({ is_train: np.cast['int32'](1) } if dropout else {})) else: f_train = None # Compile evaluation function tags_scores_out = tags_scores_list print "-- len(tags_scores_list): ", len(tags_scores_list) if not crf: f_eval = theano.function( inputs=eval_inputs, outputs=tags_scores_out, givens=({ is_train: np.cast['int32'](0) } if dropout else {}) #, # on_unused_input='ignore' ) else: f_eval = theano.function( inputs=eval_inputs, outputs=forward_n(zip(observations_list, transitions_list), viterbi=True, return_alpha=False, return_best_sequence=True), givens=({ is_train: np.cast['int32'](0) } if dropout else {}) #, # on_unused_input='ignore' ) from pprint import pprint print "--------------------------------------------------------------" pprint(self.components) return f_train, f_eval # return f_train, f_eval, f_test
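# Illustrative sketch of the lr_method string convention parsed in the builds
# above: the text before the first '-' names the optimizer and every
# following 'key_value' chunk becomes a float hyperparameter. The example
# strings are made up.
def parse_lr_method(lr_method):
    if "-" in lr_method:
        lr_method_name = lr_method[:lr_method.find('-')]
        lr_method_parameters = {}
        for x in lr_method[lr_method.find('-') + 1:].split('-'):
            split = x.split('_')
            assert len(split) == 2
            lr_method_parameters[split[0]] = float(split[1])
        return lr_method_name, lr_method_parameters
    return lr_method, {}

assert parse_lr_method("sgd-lr_0.005") == ("sgd", {"lr": 0.005})
assert parse_lr_method("adam") == ("adam", {})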
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim, word_lstm_dim, word_bidirect, lr_method, pre_emb, crf, cap_dim, training=True, **kwargs): """ Build the network. """ # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) # Number of capitalization features if cap_dim: n_cap = 4 # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') tag_ids = T.ivector(name='tag_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # # Word inputs # if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training: new_weights = word_layer.embeddings.get_value() print 'Loading pretrained embeddings from %s...' % pre_emb pretrained = {} emb_invalid = 0 #for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word.lower()) in pretrained: new_weights[i] = pretrained[re.sub( '\d', '0', word.lower())] c_zeros += 1 word_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained embeddings.' % len(pretrained) print( '%i / %i (%.4f%%) words have been initialized with ' 'pretrained embeddings.') % ( c_found + c_lower + c_zeros, n_words, 100. 
* (c_found + c_lower + c_zeros) / n_words) print( '%i found directly, %i after lowercasing, ' '%i after lowercasing + zero.') % (c_found, c_lower, c_zeros) # # Chars inputs # if char_dim: input_dim += char_lstm_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] char_rev_output = char_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim # # Capitalization feature # if cap_dim: input_dim += cap_dim cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) # Prepare final input if len(inputs) != 1: inputs = T.concatenate(inputs, axis=1) # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] if word_bidirect: final_output = T.concatenate([word_for_output, word_rev_output], axis=1) tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) else: final_output = word_for_output # Sentence to Named Entity tags - Score final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) #s_len # of words in sentence observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1) #add padding to exist tag_scores(sentencelength * tag_ids) observations = T.concatenate([b_s, observations, e_s], axis=0) # Score from tags real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Score from transitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1]].sum() all_paths_scores = forward(observations, transitions) cost = -(real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) 
params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) if cap_dim: self.add_component(cap_layer) params.extend(cap_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(transitions) params.append(transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' if training: updates = Optimization(clip=5.0).get_updates( lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function(inputs=train_inputs, outputs=cost, updates=updates, givens=({ is_train: np.cast['int32'](1) } if dropout else {})) else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function(inputs=eval_inputs, outputs=tags_scores, givens=({ is_train: np.cast['int32'](0) } if dropout else {})) else: f_eval = theano.function(inputs=eval_inputs, outputs=forward( observations, transitions, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({ is_train: np.cast['int32'](0) } if dropout else {})) return f_train, f_eval
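# Illustrative sketch of the train/test dropout convention above, assuming
# DropoutLayer zeroes units with probability p without rescaling (which the
# (1 - dropout) test-time scaling of input_test implies). The expected
# activation under the training mask then matches the scaled test-time value.
import numpy as np

p = 0.5
x = np.ones((200000, 4))
mask = (np.random.RandomState(0).uniform(size=x.shape) >= p).astype(x.dtype)

train_out = x * mask        # what the dropout layer produces at training time
test_out = (1 - p) * x      # input_test above

assert np.allclose(train_out.mean(axis=0), test_out.mean(axis=0), atol=1e-2)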
def ready(self): args = self.args embedding_layer = self.embedding_layer self.n_hidden = args.hidden_dim self.n_in = embedding_layer.n_d dropout = self.dropout = theano.shared( np.float64(args.dropout_rate).astype(theano.config.floatX)) # x is length * batch_size # y is batch_size self.x = T.imatrix('x') self.y = T.ivector('y') x = self.x y = self.y n_hidden = self.n_hidden n_in = self.n_in # fetch word embeddings # (len * batch_size) * n_in slices = embedding_layer.forward(x.ravel()) self.slices = slices # 3-d tensor, len * batch_size * n_in slices = slices.reshape((x.shape[0], x.shape[1], n_in)) # stacking the feature extraction layers pooling = args.pooling depth = args.depth layers = self.layers = [] prev_output = slices prev_output = apply_dropout(prev_output, dropout, v2=True) size = 0 softmax_inputs = [] activation = get_activation_by_name(args.act) for i in range(depth): if args.layer.lower() == "lstm": layer = LSTM(n_in=n_hidden if i > 0 else n_in, n_out=n_hidden) elif args.layer.lower() == "strcnn": layer = StrCNN(n_in=n_hidden if i > 0 else n_in, n_out=n_hidden, activation=activation, decay=args.decay, order=args.order) elif args.layer.lower() == "rcnn": layer = RCNN(n_in=n_hidden if i > 0 else n_in, n_out=n_hidden, activation=activation, order=args.order, mode=args.mode) else: raise Exception("unknown layer type: {}".format(args.layer)) layers.append(layer) prev_output = layer.forward_all(prev_output) if pooling: softmax_inputs.append(T.sum(prev_output, axis=0)) # summing over columns else: softmax_inputs.append(prev_output[-1]) prev_output = apply_dropout(prev_output, dropout) size += n_hidden # final feature representation is the concatenation of all extraction layers if pooling: softmax_input = T.concatenate(softmax_inputs, axis=1) / x.shape[0] else: softmax_input = T.concatenate(softmax_inputs, axis=1) softmax_input = apply_dropout(softmax_input, dropout, v2=True) # feed the feature repr. to the softmax output layer layers.append( Layer(n_in=size, n_out=self.nclasses, activation=softmax, has_bias=False)) for l, i in zip(layers, range(len(layers))): say("layer {}: n_in={}\tn_out={}\n".format(i, l.n_in, l.n_out)) # unnormalized score of y given x self.p_y_given_x = layers[-1].forward(softmax_input) self.pred = T.argmax(self.p_y_given_x, axis=1) self.nll_loss = T.mean( T.nnet.categorical_crossentropy(self.p_y_given_x, y)) # adding regularizations self.l2_sqr = None self.params = [] for layer in layers: self.params += layer.params for p in self.params: if self.l2_sqr is None: self.l2_sqr = args.l2_reg * T.sum(p**2) else: self.l2_sqr += args.l2_reg * T.sum(p**2) nparams = sum(len(x.get_value(borrow=True).ravel()) \ for x in self.params) say("total # parameters: {}\n".format(nparams))
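# Illustrative sketch of the two sentence representations selected by
# `pooling` above: averaging a layer's outputs over time versus keeping only
# the final state. Shapes and values are made up.
import numpy as np

seq_len, batch, n_hidden = 5, 2, 3
h = np.random.RandomState(1).randn(seq_len, batch, n_hidden)  # layer outputs

mean_pooled = h.sum(axis=0) / seq_len   # pooling=True: T.sum(...) / x.shape[0]
last_state = h[-1]                      # pooling=False: prev_output[-1]

assert mean_pooled.shape == last_state.shape == (batch, n_hidden)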
def ready(self): args = self.args embedding_layer = self.embedding_layer user_embedding_layer = self.user_embedding_layer self.n_hidden = args.hidden_dim self.n_in = embedding_layer.n_d dropout = self.dropout = theano.shared( np.float64(args.dropout_rate).astype(theano.config.floatX) ) # x is length * batch_size # y is batch_size self.x = T.imatrix('x') self.w_masks = T.fmatrix('mask') self.w_lens = T.fvector('lens') self.s_ml = T.iscalar('sent_maxlen') self.s_num = T.iscalar('sent_num') self.y = T.ivector('y') self.usr = T.ivector('users') x = self.x y = self.y usr = self.usr w_masks = self.w_masks w_lens = self.w_lens s_ml = self.s_ml s_num = self.s_num n_hidden = self.n_hidden n_emb = n_in = self.n_in layers = self.layers = [] slicesu = user_embedding_layer.forward(usr) slices = embedding_layer.forward(x.ravel()) self.slices = slices # important for updating word embeddings # 3-d tensor, len * batch_size * n_in slices = slices.reshape((x.shape[0], x.shape[1], n_in)) pooling = args.pooling prev_output = slices prev_output = apply_dropout(prev_output, dropout, v2=True) size = 0 n_hidden_t = n_hidden if args.direction == "bi": n_hidden_t = 2 * n_hidden softmax_inputs = [] activation = get_activation_by_name(args.act) if args.layer.lower() == "lstm": layer = LSTM(n_in=n_in, n_out=n_hidden_t, direction=args.direction ) elif args.layer.lower() == "cnn": layer = CNN(n_in=n_in, n_out=n_hidden_t, activation=activation, order=args.order ) else: raise Exception("unknown layer type: {}".format(args.layer)) layers.append(layer) prev_output = layer.forward_all(prev_output, masks=w_masks) prev_output = apply_dropout(prev_output, dropout) # final feature representation is the concatenation of all extraction layers if args.user_atten: layer = IterAttentionLayer( n_in=n_emb, n_out=n_hidden_t ) layers.append(layer) if args.user_atten_base: slicesu = None softmax_input = layers[-1].multi_hop_forward( prev_output, user_embs=slicesu, isWord=True, masks=w_masks) else: if pooling: softmax_input = T.sum(prev_output, axis=0) / w_lens.dimshuffle(0, 'x') else: ind = T.cast(w_lens - T.ones_like(w_lens), 'int32') softmax_input = prev_output[T.arange(ind.shape[0]), ind] softmax_input = apply_dropout(softmax_input, dropout, v2=True) n_in = n_hidden_t size = 0 softmax_inputs = [] [sentlen, emblen] = T.shape(softmax_input) prev_output = softmax_input.reshape( (sentlen / s_num, s_num, emblen)).dimshuffle(1, 0, 2) if args.layer.lower() == "lstm": layer = LSTM(n_in=n_in, n_out=n_hidden_t, direction=args.direction ) elif args.layer.lower() == "cnn": layer = CNN(n_in=n_in, n_out=n_hidden_t, activation=activation, order=args.order, ) else: raise Exception("unknown layer type: {}".format(args.layer)) layers.append(layer) prev_output = layer.forward_all(prev_output) prev_output = apply_dropout(prev_output, dropout) if args.user_atten: layer = IterAttentionLayer( n_in=n_emb, n_out=n_hidden_t ) layers.append(layer) if args.user_atten_base: slicesu = None softmax_input = layers[-1].multi_hop_forward( prev_output, user_embs=slicesu, isWord=False) else: if pooling: softmax_input = T.sum(prev_output, axis=0) / \ T.cast(s_num, 'float32') else: softmax_input = prev_output[-1] softmax_input = apply_dropout(softmax_input, dropout, v2=True) size = n_hidden_t layers.append(Layer( n_in=size, n_out=self.nclasses, activation=softmax, has_bias=False )) if not args.fix_emb: for l, i in zip(layers, range(len(layers))): say("layer {}: n_in={}\tn_out={}\n".format( i, l.n_in, l.n_out )) else: for l, i in zip(layers[1:], range(len(layers[1:]))): 
say("layer {}: n_in={}\tn_out={}\n".format( i, l.n_in, l.n_out )) # unnormalized score of y given x self.p_y_given_x = layers[-1].forward(softmax_input) self.pred = T.argmax(self.p_y_given_x, axis=1) self.nll_loss = T.mean(T.nnet.categorical_crossentropy( self.p_y_given_x, y )) # adding regularizations self.l2_sqr = None self.params = [] for layer in layers: self.params += layer.params for p in self.params: if self.l2_sqr is None: self.l2_sqr = args.l2_reg * T.sum(p**2) else: self.l2_sqr += args.l2_reg * T.sum(p**2) nparams = sum(len(x.get_value(borrow=True).ravel()) for x in self.params) say("total # parameters: {}\n".format(nparams))
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim,
          word_lstm_dim, word_bidirect, lr_method, pre_emb, crf, cap_dim,
          training=True, **kwargs):
    """
    Build the network.
    """
    # Training parameters
    n_words = len(self.id_to_word)
    n_chars = len(self.id_to_char)
    n_tags = len(self.id_to_tag)

    # Number of capitalization features
    if cap_dim:
        n_cap = 4

    # Network variables
    is_train = T.iscalar('is_train')
    word_ids = T.ivector(name='word_ids')
    char_for_ids = T.imatrix(name='char_for_ids')
    char_rev_ids = T.imatrix(name='char_rev_ids')
    char_pos_ids = T.ivector(name='char_pos_ids')
    tag_ids = T.ivector(name='tag_ids')
    cap_ids = T.ivector(name='cap_ids')

    # Sentence length
    # Final input (all word features)
    input_dim = 0
    inputs = []
    s_len = (char_pos_ids).shape[0]

    #
    # Chars inputs
    #
    input_dim += (char_lstm_dim * 2)
    char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')
    char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=False,
                         name='char_lstm_for')
    char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=False,
                         name='char_lstm_rev')
    char_lstm_for.link(char_layer.link(word_ids))
    char_lstm_rev.link(char_layer.link(cap_ids))

    final_layer = HiddenLayer(char_lstm_dim, n_chars, name='final_char_layer',
                              activation=('softmax'))
    chars_final = final_layer.link(char_lstm_for.h)

    final_rev_layer = HiddenLayer(char_lstm_dim, n_chars,
                                  name='final_char_rev_layer',
                                  activation=('softmax'))
    # project the reverse LSTM states with the reverse output layer
    chars_rev_final = final_rev_layer.link(char_lstm_rev.h)

    cost_chars = T.nnet.categorical_crossentropy(chars_final,
                                                 char_pos_ids).mean()
    cost_chars_rev = T.nnet.categorical_crossentropy(chars_rev_final,
                                                     tag_ids).mean()

    # Network parameters
    params = []
    if char_dim:
        self.add_component(char_layer)
        self.add_component(char_lstm_for)
        params.extend(char_layer.params)
        params.extend(char_lstm_for.params)
        self.add_component(char_lstm_rev)
        params.extend(char_lstm_rev.params)

    # Prepare train and eval inputs
    eval_inputs = []
    if word_dim:
        eval_inputs.append(word_ids)
    if char_dim:
        eval_inputs.append(char_for_ids)
        if char_bidirect:
            eval_inputs.append(char_rev_ids)
        eval_inputs.append(char_pos_ids)
    #if cap_dim:
    eval_inputs.append(tag_ids)
    eval_inputs.append(cap_ids)

    # Parse optimization method parameters
    if "-" in lr_method:
        lr_method_name = lr_method[:lr_method.find('-')]
        lr_method_parameters = {}
        for x in lr_method[lr_method.find('-') + 1:].split('-'):
            split = x.split('_')
            assert len(split) == 2
            lr_method_parameters[split[0]] = float(split[1])
    else:
        lr_method_name = lr_method
        lr_method_parameters = {}

    # Fetch gradients from both char_lstms
    gradients = T.grad(cost_chars, char_lstm_for.params)
    gradients_rev = T.grad(cost_chars_rev, char_lstm_rev.params)

    # Return forward char_lstm grads
    f_eval = theano.function(inputs=eval_inputs,
                             outputs=gradients,
                             givens=({is_train: np.cast['int32'](0)}
                                     if dropout else {}),
                             on_unused_input='ignore')

    # Return reverse char_lstm grads
    f_eval_rev = theano.function(inputs=eval_inputs,
                                 outputs=gradients_rev,
                                 givens=({is_train: np.cast['int32'](0)}
                                         if dropout else {}),
                                 on_unused_input='ignore')

    return f_eval, f_eval_rev
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim, word_lstm_dim, word_bidirect, lr_method, pre_emb, crf, cap_dim, training=True, word_to_id=None, **kwargs): """ Build the network. """ # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) # Number of capitalization features if cap_dim: n_cap = 6 if self.parameters['pos_dim']: n_pos = len(self.id_to_pos) if self.parameters['ortho_dim']: n_ortho = len(self.id_to_ortho) if self.parameters['multi_task']: n_segment_tags = len(self.id_to_segment) if self.parameters['pre_emb_1_dim']: n_words_1 = len(self.id_to_word_1) # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') tag_ids = T.ivector(name='tag_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') if self.parameters['pos_dim']: pos_ids = T.ivector(name='pos_ids') if self.parameters['ortho_dim']: ortho_ids = T.ivector(name='ortho_ids') if self.parameters['multi_task']: segment_tags_ids = T.ivector(name='segment_tags_ids') if self.parameters['pre_emb_1_dim']: word_ids_1 = T.ivector(name='doc_ids_dn') if self.parameters['language_model']: y_fwd_ids = T.ivector(name='y_fwd_ids') y_bwd_ids = T.ivector(name='y_bwd_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # # Word inputs # if word_dim: input_dim += word_dim print('word_dim: {}'.format(word_dim)) word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training and not self.parameters['reload']: new_weights = word_layer.embeddings.get_value() print( 'Loading pretrained embeddings from {}...'.format(pre_emb)) pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print('WARNING: {} invalid lines'.format(emb_invalid)) c_found = 0 c_lower = 0 c_zeros = 0 oov_words = 0 if self.parameters['emb_of_unk_words']: # TODO # add path as a parameter fast_text_model_p = '/home/ubuntu/usama_ws/resources/Spanish-Corporas/embeddings/fasttext/' \ 'fasttext-100d.bin' ft_model = load_model(fast_text_model_p) # Lookup table initialization for i in range(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word.lower()) in pretrained: new_weights[i] = pretrained[re.sub( '\d', '0', word.lower())] c_zeros += 1 else: if self.parameters['emb_of_unk_words']: new_weights[i] = ft_model.get_word_vector(word) oov_words += 1 # set row corresponding to padding token to 0 new_weights[word_to_id['<PADDING>']] = np.zeros(word_dim) word_layer.embeddings.set_value(new_weights) print('Loaded {} pretrained embeddings.'.format( len(pretrained))) print('{} / {} ({} percent) words have been initialized with ' 'pretrained embeddings.'.format( c_found + c_lower + c_zeros, n_words, 100. 
* (c_found + c_lower + c_zeros) / n_words)) print('{} found directly, {} after lowercasing, ' '{} after lowercasing + zero.'.format( c_found, c_lower, c_zeros)) print('oov words count: {}'.format(oov_words)) # # Word inputs # if self.parameters['pre_emb_1']: print('pre_emb_1_dim: {}'.format(self.parameters['pre_emb_1_dim'])) input_dim += self.parameters['pre_emb_1_dim'] word_layer_1 = EmbeddingLayer(n_words_1, word_dim, name='word_layer_1') word_input_1 = word_layer_1.link(word_ids_1) inputs.append(word_input_1) if training and not self.parameters['reload']: # Initialize with pretrained embeddings new_weights_1 = word_layer_1.embeddings.get_value() print('Loading pretrained embeddings from {}...'.format( self.parameters['pre_emb_1'])) pretrained_1 = {} emb_invalid_1 = 0 for i, line in enumerate( codecs.open(self.parameters['pre_emb_1'], 'r', 'utf-8')): line = line.rstrip().split() if len(line) == self.parameters['pre_emb_1_dim'] + 1: pretrained_1[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: emb_invalid_1 += 1 if emb_invalid_1 > 0: print('WARNING: {} invalid lines'.format(emb_invalid_1)) c_found = 0 c_lower = 0 c_zeros = 0 oov_words = 0 # Lookup table initialization for i in range(n_words_1): word_1 = self.id_to_word_1[i] if word_1 in pretrained_1: new_weights_1[i] = pretrained_1[word_1] c_found += 1 elif word_1.lower() in pretrained_1: new_weights_1[i] = pretrained_1[word_1.lower()] c_lower += 1 elif re.sub('\d', '0', word_1.lower()) in pretrained_1: new_weights_1[i] = pretrained_1[re.sub( '\d', '0', word_1.lower())] c_zeros += 1 else: oov_words += 1 word_layer_1.embeddings.set_value(new_weights_1) print('Loaded {} pretrained embeddings.'.format( len(pretrained_1))) print('{} / {} ({} percent) words have been initialized with ' 'pretrained embeddings.'.format( c_found + c_lower + c_zeros, n_words, 100. * (c_found + c_lower + c_zeros) / n_words)) print('{} found directly, {} after lowercasing, ' '{} after lowercasing + zero.'.format( c_found, c_lower, c_zeros)) print('oov words count: {}'.format(oov_words)) # # Chars inputs # if char_dim: input_dim += char_lstm_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] char_rev_output = char_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim # # Capitalization feature # if cap_dim: input_dim += cap_dim cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) if self.parameters['pos_dim']: input_dim += self.parameters['pos_dim'] pos_layer = EmbeddingLayer(n_pos, self.parameters['pos_dim'], name='pos_layer') inputs.append(pos_layer.link(pos_ids)) # zeroing the '<UNK>' pos tag row # loading reverse mappings pos_to_id = {y: x for x, y in self.id_to_pos.items()} unk_idx = pos_to_id['<UNK>'] _pos_wts = pos_layer.embeddings.get_value() _pos_wts[unk_idx] = [0.] 
* self.parameters['pos_dim'] pos_layer.embeddings.set_value(_pos_wts) if self.parameters['ortho_dim']: input_dim += self.parameters['ortho_dim'] ortho_layer = EmbeddingLayer(n_ortho, self.parameters['ortho_dim'], name='ortho_layer') inputs.append(ortho_layer.link(ortho_ids)) ortho_to_id = {y: x for x, y in self.id_to_ortho.items()} unk_idx = ortho_to_id['<UNK>'] _pos_wts = ortho_layer.embeddings.get_value() _pos_wts[unk_idx] = [0.] * self.parameters['ortho_dim'] ortho_layer.embeddings.set_value(_pos_wts) print('input_dim: {}'.format(input_dim)) # Prepare final input inputs = T.concatenate(inputs, axis=1) if len(inputs) != 1 else inputs[0] # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] if word_bidirect: n_h = 2 * word_lstm_dim final_output = T.concatenate([word_for_output, word_rev_output], axis=1) tanh_layer = HiddenLayer(n_h, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) else: final_output = word_for_output # Sentence to Named Entity tags - Score final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) if self.parameters['multi_task']: # Sentence to Named Entity Segmentation tags - Score segment_layer = HiddenLayer( word_lstm_dim, n_segment_tags, name='segment_layer', activation=(None if crf else 'softmax')) segment_tags_scores = segment_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() if self.parameters['multi_task']: cost_segment = T.nnet.categorical_crossentropy( segment_tags_scores, segment_tags_ids).mean() cost += cost_segment # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1) observations = T.concatenate([b_s, observations, e_s], axis=0) # Score from tags real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Score from transitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1]].sum() all_paths_scores = forward(observations, transitions) cost = -(real_path_score - all_paths_scores) if self.parameters['multi_task']: segment_transitions = shared( (n_segment_tags + 2, n_segment_tags + 2), 'segment_transitions') seg_small = -1000 seg_b_s = np.array([[seg_small] * n_segment_tags + [0, seg_small]]).astype(np.float32) seg_e_s = np.array([[seg_small] * n_segment_tags + [seg_small, 0]]).astype(np.float32) segment_observations = T.concatenate( [segment_tags_scores, seg_small * T.ones((s_len, 2))], axis=1) segment_observations = T.concatenate( [seg_b_s, segment_observations, 
seg_e_s], axis=0) # Score from tags seg_real_path_score = segment_tags_scores[ T.arange(s_len), segment_tags_ids].sum() # Score from transitions seg_b_id = theano.shared( value=np.array([n_segment_tags], dtype=np.int32)) seg_e_id = theano.shared( value=np.array([n_segment_tags + 1], dtype=np.int32)) seg_padded_tags_ids = T.concatenate( [seg_b_id, segment_tags_ids, seg_e_id], axis=0) seg_real_path_score += segment_transitions[ seg_padded_tags_ids[T.arange(s_len + 1)], seg_padded_tags_ids[T.arange(s_len + 1) + 1]].sum() seg_all_paths_scores = forward(segment_observations, segment_transitions) cost_segment = -(seg_real_path_score - seg_all_paths_scores) cost += cost_segment if training and self.parameters['ranking_loss']: def recurrence(x_t, y_t): token_prob_pos = x_t[y_t] arg_max_1 = T.argmax(x_t) arg_max_2 = T.argsort(-x_t)[1] token_prob_neg = ifelse(T.eq(y_t, arg_max_1), x_t[arg_max_2], x_t[arg_max_1]) cost_t = T.max([0, 1.0 - token_prob_pos + token_prob_neg]) return cost_t cost_r, _ = theano.scan(recurrence, sequences=[tags_scores, tag_ids]) cum_cost = T.sum(cost_r) cost += cum_cost if self.parameters['language_model']: lm_fwd_layer = HiddenLayer(word_lstm_dim, n_words, name='lm_fwd_layer', activation='softmax') lm_fwd_scores = lm_fwd_layer.link(final_output) lm_fwd_cost = T.nnet.categorical_crossentropy( lm_fwd_scores, y_fwd_ids).mean() lm_bwd_layer = HiddenLayer(word_lstm_dim, n_words, name='lm_bwd_layer', activation='softmax') lm_bwd_scores = lm_bwd_layer.link(final_output) lm_bwd_cost = T.nnet.categorical_crossentropy( lm_bwd_scores, y_bwd_ids).mean() cost_lm = lm_fwd_cost + lm_bwd_cost cost += cost_lm # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if self.parameters['pre_emb_1']: self.add_component(word_layer_1) params.extend(word_layer_1.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) if cap_dim: self.add_component(cap_layer) params.extend(cap_layer.params) if self.parameters['pos_dim']: self.add_component(pos_layer) params.extend(pos_layer.params) if self.parameters['ortho_dim']: self.add_component(ortho_layer) params.extend(ortho_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if self.parameters['multi_task']: self.add_component(segment_layer) params.extend(segment_layer.params) if crf: self.add_component(transitions) params.append(transitions) if self.parameters['multi_task']: self.add_component(segment_transitions) params.append(segment_transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) if self.parameters['language_model']: self.add_component(lm_fwd_layer) params.extend(lm_fwd_layer.params) self.add_component(lm_bwd_layer) params.extend(lm_bwd_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) if self.parameters['pos_dim']: eval_inputs.append(pos_ids) if self.parameters['ortho_dim']: eval_inputs.append(ortho_ids) if self.parameters['pre_emb_1']: eval_inputs.append(word_ids_1) 
train_inputs = eval_inputs + [tag_ids] if self.parameters['multi_task']: train_inputs += [segment_tags_ids] if self.parameters['language_model']: train_inputs.append(y_fwd_ids) train_inputs.append(y_bwd_ids) # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print('Compiling...') if training: updates = Optimization(clip=5.0).get_updates( lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function(inputs=train_inputs, outputs=cost, updates=updates, givens=({ is_train: np.cast['int32'](1) } if dropout else {}), allow_input_downcast=True) else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function(inputs=eval_inputs, outputs=tags_scores, givens=({ is_train: np.cast['int32'](0) } if dropout else {}), allow_input_downcast=True) else: f_eval = theano.function(inputs=eval_inputs, outputs=forward( observations, transitions, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({ is_train: np.cast['int32'](0) } if dropout else {}), allow_input_downcast=True) return f_train, f_eval
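# Once compiled, f_train consumes the same tensors as f_eval plus the gold
# tag_ids (and the extra multi-task / language-model targets when enabled),
# while f_eval returns either the softmax tag scores or, with a CRF, the
# Viterbi best sequence. A hedged sketch of how such functions are usually
# driven; `sentences` and `make_inputs` are hypothetical placeholders, not
# part of the code above.
def train_epoch(f_train, sentences, make_inputs):
    total_cost = 0.0
    for sentence in sentences:
        inputs = make_inputs(sentence, add_labels=True)   # feature ids + tag_ids
        total_cost += f_train(*inputs)                    # one update step, returns the cost
    return total_cost / len(sentences)

def predict(f_eval, sentence, make_inputs):
    inputs = make_inputs(sentence, add_labels=False)      # feature ids only
    return f_eval(*inputs)                                # tag scores, or best path with a CRF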
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim, word_lstm_dim, word_bidirect, lr_method, pre_emb, crf, cap_dim, training=True, **kwargs ): """ Build the network. """ # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags_loaded = len(self.id_to_tag_old) n_tags = len(self.id_to_tag) print "n_words: ", n_words, "n_chars: ", n_chars, "n_tags_loaded: ", n_tags_loaded, "n_tags(new ones): ", n_tags print self.id_to_tag print self.id_to_tag_old # Number of capitalization features if cap_dim: n_cap = 4 # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') tag_ids = T.ivector(name='tag_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # # Word inputs # if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) inputs.append(word_input) # # Chars inputs # if char_dim: input_dim += char_lstm_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[ T.arange(s_len), char_pos_ids ] char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[ T.arange(s_len), char_pos_ids ] inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim # # Capitalization feature # if cap_dim: input_dim += cap_dim cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) # Prepare final input if len(inputs) != 1: inputs = T.concatenate(inputs, axis=1) # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] if word_bidirect: final_output = T.concatenate( [word_for_output, word_rev_output], axis=1 ) tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) else: final_output = word_for_output # Sentence to Named Entity tags - Score final_layer_init = HiddenLayer(word_lstm_dim, n_tags_loaded, name='final_layer', activation=(None)) tags_loaded_scores = final_layer_init.link(final_output) print word_lstm_dim+n_tags_loaded final_layer = HiddenLayer(word_lstm_dim+n_tags_loaded, n_tags, name='final_layer_new', activation=('softmax')) final_out_new = T.concatenate([final_output, tags_loaded_scores], axis=1) tags_scores = final_layer.link(final_out_new) # No CRF cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # Network parameters params = [] if word_dim: 
self.add_component(word_layer) params.extend(word_layer.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) if cap_dim: self.add_component(cap_layer) params.extend(cap_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' if training: updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function( inputs=train_inputs, outputs=cost, updates=updates, givens=({is_train: np.cast['int32'](1)} if dropout else {}) ) else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function( inputs=eval_inputs, outputs=tags_scores, givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) return f_train, f_eval
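# This variant feeds the new softmax layer both the BiLSTM output and the
# logits of a previously trained tag layer (final_layer_init), so the old
# model's predictions act as extra features for the new tag set. A minimal
# NumPy sketch of that stacking step, with made-up dimensions.
import numpy as np

rng = np.random.RandomState(0)
s_len, word_lstm_dim, n_tags_loaded, n_tags = 7, 5, 3, 4

final_output = rng.randn(s_len, word_lstm_dim)       # BiLSTM features, one row per token
W_old = rng.randn(word_lstm_dim, n_tags_loaded)      # stands in for final_layer_init
tags_loaded_scores = final_output.dot(W_old)         # scores over the old tag set

# the new layer sees [BiLSTM features ; old tag scores]
final_out_new = np.concatenate([final_output, tags_loaded_scores], axis=1)
W_new = rng.randn(word_lstm_dim + n_tags_loaded, n_tags)
logits = final_out_new.dot(W_new)
tags_scores = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
print(tags_scores.shape)                             # (7, 4)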
def build(self, parameters): #{{{ """ Build the network. """ #some parameters dropout = parameters['dropout'] char_dim = parameters['char_dim'] char_lstm_dim = parameters['char_lstm_dim'] char_bidirect = parameters['char_bidirect'] word_dim = parameters['word_dim'] word_lstm_dim = parameters['word_lstm_dim'] word_bidirect = parameters['word_bidirect'] lr_method = parameters['lr_method'] pre_emb = parameters['pre_emb'] crf = parameters['crf'] cap_dim = parameters['cap_dim'] training = parameters['training'] features = parameters['features'] # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) self.output_dim = len(self.id_to_tag) self.transitions = shared((self.output_dim + 1, self.output_dim), 'transitions') # Number of capitalization features if cap_dim: n_cap = 4 if features is not None and features['lemma']['isUsed']: lemma_ids = T.ivector(name='lemma_ids') if features is not None and features['pos']['isUsed']: pos_ids = T.ivector(name='pos_ids') if features is not None and features['chunk']['isUsed']: chunk_ids = T.ivector(name='chunk_ids') if features is not None and features['NER']['isUsed']: dic_ids = T.ivector(name='dic_ids') # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') tag_ids = T.ivector(name='tag_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # Word inputs #{{{ if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) #for attention inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training: new_weights = word_layer.embeddings.get_value() print 'Loading pretrained embeddings from %s...' % pre_emb pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word.lower()) in pretrained: new_weights[i] = pretrained[re.sub( '\d', '0', word.lower())] c_zeros += 1 word_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained embeddings.' % len(pretrained) print( '%i / %i (%.4f%%) words have been initialized with ' 'pretrained embeddings.') % ( c_found + c_lower + c_zeros, n_words, 100. 
* (c_found + c_lower + c_zeros) / n_words) print( '%i found directly, %i after lowercasing, ' '%i after lowercasing + zero.') % ( c_found, c_lower, c_zeros) #}}} # Chars inputs #{{{ if char_dim: input_dim += char_lstm_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] char_rev_output = char_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim #}}} # Capitalization feature # if cap_dim: input_dim += cap_dim cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) # Prepare final input if len(inputs) != 1: inputs = T.concatenate(inputs, axis=1) # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] if word_bidirect: final_output = T.concatenate([word_for_output, word_rev_output], axis=1) tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) else: final_output = word_for_output # Sentence to Named Entity tags - Score final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: #all_paths_scores = forward(observations, self.transitions) #cost = - (self.modelScore(tag_ids,tags_scores,s_len) - all_paths_scores) #real_path_score=self.modelScore(tag_ids,tags_scores,tag_ids.shape[0]) ; #error=real_path_score+self.noiseLoss(tags_scores,tag_ids,0.5); #cost=-error; #cost=self.likehoodLoss(tags_scores,tag_ids,observations,2) real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Score from transitions padded_tags_ids = T.concatenate([[n_tags], tag_ids], axis=0) real_path_score += self.transitions[ padded_tags_ids[T.arange(s_len)], padded_tags_ids[T.arange(s_len) + 1]].sum() all_paths_scores = forward(tags_scores, self.transitions) cost = -(real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) if cap_dim: self.add_component(cap_layer) params.extend(cap_layer.params) self.add_component(final_layer) 
params.extend(final_layer.params) if crf: self.add_component(self.transitions) params.append(self.transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' if training: import optimizers self.optimizer = optimizers.RMSprop(lr=0.001) updates = Optimization(clip=5.0).get_updates( lr_method_name, cost, params, **lr_method_parameters) self.constraints = {} #updates = self.optimizer.get_updates(params,self.constraints,cost); f_train = theano.function(inputs=train_inputs, outputs=cost, updates=updates, givens=({ is_train: np.cast['int32'](1) } if dropout else {})) #for debug #f_Debug = theano.function( # inputs=train_inputs, # outputs=cost, # updates=self.update, # givens=({is_train: np.cast['int32'](1)} if dropout else {}) #) #debug end else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function(inputs=eval_inputs, outputs=tags_scores, givens=({ is_train: np.cast['int32'](0) } if dropout else {})) else: f_eval = theano.function(inputs=eval_inputs, outputs=forward( tags_scores, self.transitions, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({ is_train: np.cast['int32'](0) } if dropout else {})) return f_train, f_eval
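# In the CRF branch above the cost is the negative log-likelihood of the gold
# tag sequence: the gold path score (emissions plus transitions, with a start
# state stored in row n_tags of the (n_tags + 1, n_tags) transition matrix)
# minus the log-sum-exp over all paths computed by forward(). A minimal NumPy
# version of that scoring, under the same transition layout.
import numpy as np

def logsumexp(x, axis):
    m = x.max(axis=axis, keepdims=True)
    return (m + np.log(np.exp(x - m).sum(axis=axis, keepdims=True))).squeeze(axis)

def crf_nll(tags_scores, tag_ids, transitions):
    # tags_scores: (s_len, n_tags) emission scores
    # transitions: (n_tags + 1, n_tags); row n_tags is the start state
    s_len, n_tags = tags_scores.shape
    # score of the gold path: emissions plus transitions along [start] + tags
    padded = np.concatenate([[n_tags], tag_ids])
    real_path_score = tags_scores[np.arange(s_len), tag_ids].sum()
    real_path_score += transitions[padded[:-1], padded[1:]].sum()
    # forward algorithm: alpha[j] = log-sum of all path scores ending in tag j
    alpha = transitions[n_tags] + tags_scores[0]
    for t in range(1, s_len):
        alpha = logsumexp(alpha[:, None] + transitions[:n_tags], axis=0) + tags_scores[t]
    all_paths_scores = logsumexp(alpha, axis=0)
    return -(real_path_score - all_paths_scores)

rng = np.random.RandomState(0)
print(crf_nll(rng.randn(6, 5), np.array([0, 2, 2, 1, 4, 3]), rng.randn(6, 5)))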
def build4(self, parameters): #{{{ """ Build the network. """ #some parameters dropout = parameters['dropout'] char_dim = parameters['char_dim'] char_lstm_dim = parameters['char_lstm_dim'] char_bidirect = parameters['char_bidirect'] word_dim = parameters['word_dim'] word_lstm_dim = parameters['word_lstm_dim'] word_bidirect = parameters['word_bidirect'] lr_method = parameters['lr_method'] pre_emb = parameters['pre_emb'] crf = parameters['crf'] cap_dim = parameters['cap_dim'] training = parameters['training'] features = parameters['features'] useAttend = parameters['useAttend'] if useAttend: reloadParam = parameters['loading'] else: reloadParam = None if reloadParam is not None: reloadPath = parameters['loading_path'] sentencesLevelLoss = parameters['sentencesLevelLoss'] # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) self.output_dim = len(self.id_to_tag) self.transitions = shared((self.output_dim + 1, self.output_dim), 'transitions') # Number of capitalization features if cap_dim: n_cap = 4 # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') wordTrue_ids = T.ivector(name='wordTrue_ids') char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') docLen = T.ivector(name='docLen') tag_ids = T.ivector(name='tag_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') #some features if features is not None and features['lemma']['isUsed']: lemma_ids = T.ivector(name='lemma_ids') if features is not None and features['pos']['isUsed']: pos_ids = T.ivector(name='pos_ids') if features is not None and features['chunk']['isUsed']: chunk_ids = T.ivector(name='chunk_ids') if features is not None and features['dic']['isUsed']: dic_ids = T.ivector(name='dic_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # Word inputs #{{{ if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) wordTrue_input = word_layer.link(wordTrue_ids) inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training: new_weights = word_layer.embeddings.get_value() print 'Loading pretrained embeddings from %s...' % pre_emb pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word.lower()) in pretrained: new_weights[i] = pretrained[re.sub( '\d', '0', word.lower())] c_zeros += 1 word_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained embeddings.' % len(pretrained) print( '%i / %i (%.4f%%) words have been initialized with ' 'pretrained embeddings.') % ( c_found + c_lower + c_zeros, n_words, 100. 
* (c_found + c_lower + c_zeros) / n_words) print( '%i found directly, %i after lowercasing, ' '%i after lowercasing + zero.') % ( c_found, c_lower, c_zeros) #}}} # Chars inputs #{{{ if char_dim: input_dim += char_lstm_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] char_rev_output = char_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] char_output = T.concatenate([char_for_output, char_rev_output], axis=-1) inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim #}}} # Capitalization feature # if cap_dim: input_dim += cap_dim cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) #add feature #{{{ if features is not None and features['lemma']['isUsed']: lemma_layer = EmbeddingLayer(features['lemma']['num'], features['lemma']['dim'], name='lemma_layer') if features['lemma']['pre_emb'] is not "": new_weights = lemma_layer.embeddings.get_value() loadPreEmbFeatures(features['lemma']['pre_emb'], features['feature_to_id_map']['lemma'], new_weights, lower=True) lemma_layer.embeddings.set_value(new_weights) lemma_output = lemma_layer.link(lemma_ids) if features['lemma']['lstm-input']: input_dim += features['lemma']['dim'] inputs.append(lemma_output) if features is not None and features['pos']['isUsed']: pos_layer = EmbeddingLayer(features['pos']['num'], features['pos']['dim'], name='pos_layer') if features['pos']['pre_emb'] is not "": new_weights = pos_layer.embeddings.get_value() loadPreEmbFeatures(features['pos']['pre_emb'], features['feature_to_id_map']['pos'], new_weights) pos_layer.embeddings.set_value(new_weights) pos_output = pos_layer.link(pos_ids) if features['pos']['lstm-input']: input_dim += features['pos']['dim'] inputs.append(pos_output) if features is not None and features['chunk']['isUsed']: chunk_layer = EmbeddingLayer(features['chunk']['num'], features['chunk']['dim'], name='chunk_layer') chunk_output = chunk_layer.link(chunk_ids) if features['chunk']['lstm-input']: input_dim += features['chunk']['dim'] inputs.append(chunk_output) if features is not None and features['dic']['isUsed']: dic_layer = EmbeddingLayer(features['dic']['num'], features['dic']['dim'], name='dic_layer') dic_output = dic_layer.link(dic_ids) if features['dic']['lstm-input']: input_dim += features['dic']['dim'] inputs.append(dic_output) #}}} # Prepare final input if len(inputs) != 1: inputs = T.concatenate(inputs, axis=1) # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') if sentencesLevelLoss: def sentLSTM(i, output, input, lenVec): #{{{ Len = lenVec[i] accLen = lenVec[:i].sum() currentInput = input[accLen:accLen + Len] word_lstm_for.link(currentInput) word_lstm_rev.link(currentInput[::-1, :]) wordForOutput = word_lstm_for.h wordRevOutput = word_lstm_rev.h[::-1, :] 
finalOutput = T.concatenate([wordForOutput, wordRevOutput], axis=-1) output = T.set_subtensor(output[accLen:accLen + Len], finalOutput) return output #}}} result, update = theano.scan( fn=sentLSTM, outputs_info=T.zeros((inputs.shape[0], word_lstm_dim * 2), dtype='float32'), sequences=[T.arange(docLen.shape[0])], non_sequences=[inputs, docLen]) word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_for_c = word_lstm_for.c word_rev_output = word_lstm_rev.h[::-1, :] word_rev_c = word_lstm_rev.c[::-1, :] final_c = T.concatenate([word_for_c, word_rev_c], axis=-1) final_output = result[-1] else: word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_for_c = word_lstm_for.c word_rev_output = word_lstm_rev.h[::-1, :] word_rev_c = word_lstm_rev.c[::-1, :] final_output = T.concatenate([word_for_output, word_rev_output], axis=-1) final_c = T.concatenate([word_for_c, word_rev_c], axis=-1) if useAttend: #attention layer attended = [] attendedDim = 0 if features is not None and features['word']['attended']: attended.append(wordTrue_input) attendedDim += word_dim if features is not None and features['char']['attended']: attended.append(char_output) attendedDim += char_lstm_dim * 2 if features is not None and features['lemma']['attended']: attended.append(lemma_output) attendedDim += features['lemma']['dim'] if features is not None and features['pos']['attended']: attended.append(pos_output) attendedDim += features['pos']['dim'] if features is not None and features['chunk']['attended']: attended.append(chunk_output) attendedDim += features['chunk']['dim'] if features is not None and features['dic']['attended']: attended.append(dic_output) attendedDim += features['dic']['dim'] attention_layer = AttentionLayer( attended_dim=attendedDim, state_dim=attendedDim, #attention_layer=AttentionLayer(attended_dim=word_lstm_dim*2, # state_dim=word_lstm_dim*2, source_dim=word_lstm_dim * 2, scoreFunName=parameters['attenScoreFun'], name='attention_layer') if len(attended) > 1: attendedInput = T.concatenate(attended, axis=-1) else: attendedInput = attended[0] final_output = attention_layer.link(attendedInput, attendedInput, final_output) #using lstm_state to compute attention #final_output=attention_layer.link(final_output,final_c,final_output); self.energy = attention_layer.energy else: final_output = final_output tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) # Sentence to Named Entity tags - Score final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: if sentencesLevelLoss: #calcuate loss according to sentence instead of docLen def sentLoss(i, scores, trueIds, transitions, lenVec): #{{{ Len = lenVec[i] accLen = lenVec[:i].sum() currentTagsScores = scores[accLen:accLen + Len] currentIds = trueIds[accLen:accLen + Len] real_path_score = currentTagsScores[T.arange(Len), currentIds].sum() # Score from transitions padded_tags_ids = T.concatenate([[n_tags], currentIds], axis=0) real_path_score += transitions[ padded_tags_ids[T.arange(Len)], padded_tags_ids[T.arange(Len) + 1]].sum() all_paths_scores = forward(currentTagsScores, transitions) cost = -(real_path_score - all_paths_scores) return cost #}}} result, update = theano.scan( fn=sentLoss, 
outputs_info=None, sequences=[T.arange(docLen.shape[0])], non_sequences=[ tags_scores, tag_ids, self.transitions, docLen ]) cost = result.sum() else: real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Score from transitions padded_tags_ids = T.concatenate([[n_tags], tag_ids], axis=0) real_path_score += self.transitions[ padded_tags_ids[T.arange(s_len)], padded_tags_ids[T.arange(s_len) + 1]].sum() all_paths_scores = forward(tags_scores, self.transitions) cost = -(real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) if cap_dim: self.add_component(cap_layer) params.extend(cap_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(self.transitions) params.append(self.transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) #add feature layer if features is not None and features['lemma']['isUsed']: self.add_component(lemma_layer) params.extend(lemma_layer.params) if features is not None and features['pos']['isUsed']: self.add_component(pos_layer) params.extend(pos_layer.params) if features is not None and features['chunk']['isUsed']: self.add_component(chunk_layer) params.extend(chunk_layer.params) if features is not None and features['dic']['isUsed']: self.add_component(dic_layer) params.extend(dic_layer.params) if useAttend and reloadParam: #reload pre-train params model_path = self.model_path self.model_path = reloadPath print "loading:", self.model_path self.reload(features) self.model_path = model_path if useAttend: #add attention_layer self.add_component(attention_layer) params.extend(attention_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) if useAttend: eval_inputs.append(wordTrue_ids) if sentencesLevelLoss: eval_inputs.append(docLen) #add feature input if features is not None and features['lemma']['isUsed']: eval_inputs.append(lemma_ids) if features is not None and features['pos']['isUsed']: eval_inputs.append(pos_ids) if features is not None and features['chunk']['isUsed']: eval_inputs.append(chunk_ids) if features is not None and features['dic']['isUsed']: eval_inputs.append(dic_ids) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' 
if training: #constraints if useAttend: self.constraints = attention_layer.constraints else: self.constraints = {} from keras import optimizers self.optimizer = optimizers.SGD(lr=0.001, momentum=0.9, decay=0., nesterov=True, clipvalue=5) self.optimizer = optimizers.RMSprop() #self.optimizer=SGD(lr=lr_method_parameters['lr'],clipvalue=5,gradient_noise=0.01) updates = Optimization(clip=5.0).get_updates( lr_method_name, cost, params, constraints=self.constraints, **lr_method_parameters) #updates = self.optimizer.get_updates(params,self.constraints,cost); f_train_outputs = [cost] if useAttend: f_train_outputs.append(self.energy) f_train = theano.function(inputs=train_inputs, outputs=f_train_outputs, updates=updates, on_unused_input='ignore', givens=({ is_train: np.cast['int32'](1) } if dropout else {})) f_test = theano.function(inputs=train_inputs, outputs=cost, on_unused_input='ignore', givens=({ is_train: np.cast['int32'](0) } if dropout else {})) self.f_test = f_test else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function(inputs=eval_inputs, outputs=tags_scores, givens=({ is_train: np.cast['int32'](0) } if dropout else {})) else: if sentencesLevelLoss: def sentVitebe(i, predictTag, scores, transitions, lenVec): #{{{ Len = lenVec[i] accLen = lenVec[:i].sum() currentTagsScores = scores[accLen:accLen + Len] currentPredictIds = forward(currentTagsScores, transitions, viterbi=True, return_alpha=False, return_best_sequence=True) predictTag = T.set_subtensor( predictTag[accLen:accLen + Len], currentPredictIds) return predictTag #}}} predictTag, update = theano.scan( fn=sentVitebe, outputs_info=T.zeros((tags_scores.shape[0], ), dtype='int32'), sequences=[T.arange(docLen.shape[0])], non_sequences=[tags_scores, self.transitions, docLen]) predictTag = predictTag[-1] else: predictTag = forward(tags_scores, self.transitions, viterbi=True, return_alpha=False, return_best_sequence=True) f_eval = theano.function(inputs=eval_inputs, outputs=predictTag, on_unused_input='ignore', givens=({ is_train: np.cast['int32'](0) } if dropout else {})) #f_AttenVisual=theano.function( # inputs=eval_inputs, # outputs=[predictTag,self.energy], # on_unused_input='ignore', # givens=({is_train: np.cast['int32'](0)} if dropout else {}) # ) #self.f_AttenVisual=f_AttenVisual; return f_train, f_eval
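# build4 can optionally score each sentence of a document separately
# (sentencesLevelLoss): docLen holds the per-sentence lengths, and a
# theano.scan slices the concatenated scores sentence by sentence before
# running the CRF loss or Viterbi. The slicing logic, as a plain NumPy sketch
# with a made-up length vector.
import numpy as np

def per_sentence_slices(tags_scores, doc_len):
    # tags_scores: (sum(doc_len), n_tags) scores for a whole document
    # doc_len:     lengths of the individual sentences, in order
    offsets = np.concatenate([[0], np.cumsum(doc_len)])
    for start, end in zip(offsets[:-1], offsets[1:]):
        yield tags_scores[start:end]

scores = np.arange(18).reshape(9, 2)      # 9 tokens, 2 tags
for sent_scores in per_sentence_slices(scores, np.array([4, 2, 3])):
    print(sent_scores.shape)              # (4, 2), (2, 2), (3, 2)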
def build( self, dropout, ortho_char_input_dim, # Should be inferred from the input ortho_char_dim, ortho_char_lstm_dim, char_bidirect, word_vec_input_dim, # Should be inferred from the input wvecs word_dim, # The vector size after projection of the input vector word_lstm_dim, word_bidirect, lr_method, crf, use_type_sparse_feats, type_sparse_feats_input_dim, # Can be inferred from the output of the feature extractors type_sparse_feats_proj_dim, # This is a hyper-parameter use_token_sparse_feats, token_sparse_feats_input_dim, # Can be inferred from the output of the feature extractors # token_sparse_feats_proj_dim, # This is a hyper-parameter use_ortho_attention, use_phono_attention, # use_convolution, phono_char_input_dim, # Can be inferred phono_char_dim, phono_char_lstm_dim, training=True, **kwargs): """ Build the network. """ assert word_dim or phono_char_dim or ortho_char_dim, "No input selected while building the network!" # Training parameters n_tags = len(self.id_to_tag) # Network variables is_train = T.iscalar('is_train') word_vecs = T.dmatrix( name="word_vecs") # A vector for each word in the sentence # => matrix: (len_sent, w_emb_dim) ortho_char_for_vecs = T.dtensor3( name="ortho_char_for_vecs" ) # For each char of each word in the sentence, a char vector # ortho_char_for_vecs = T.ftensor3(name="ortho_char_for_vecs") # => tensor of form: (len_sent, max_wchar_len, char_emb_dim) ortho_char_rev_vecs = T.dtensor3(name="ortho_char_rev_vecs") # ortho_char_rev_vecs = T.ftensor3(name="ortho_char_rev_vecs") # For each char of each word in the sentence, a char vector # => tensor of form: (len_sent, max_wchar_len, char_emb_dim) phono_char_for_vecs = T.dtensor3(name="phono_char_for_vecs") # phono_char_for_vecs = T.ftensor3(name="phono_char_for_vecs") # For each char of each word in the sentence, a char vector # => tensor of form: (len_sent, max_ortho_char_len, char_emb_dim) phono_char_rev_vecs = T.dtensor3(name="phono_char_rev_vecs") # phono_char_rev_vecs = T.ftensor3(name="phono_char_rev_vecs") # For each char of each word in the sentence, a char vector # => tensor of form: (len_sent, max_phono_char_len, char_emb_dim) ortho_char_pos_ids = T.ivector(name='ortho_char_pos_ids') # The word len for each word in the sentence => vect of form: (len_sent,) phono_char_pos_ids = T.ivector(name='phono_char_pos_ids') # The word len for each word in the sentence => vect of form: (len_sent,) type_sparse_feats = T.imatrix(name="type_sparse_feats") # Type sparse features are appended to the input to the word lstm # For each word, a vector of type level sparse feats => mat of form: (len_sent, type_sparse_dim) token_sparse_feats = T.imatrix(name="token_sparse_feats") # Token sparse features are appended to the pre-crf layer # For each word, a vector of token level sparse feats => mat of form: (len_sent, token_sparse_dim) tag_ids = T.ivector(name='tag_ids') # The tag id for each word in the sentence => vect of form: (len_sent,) # Sentence length s_len = (word_vecs if word_dim else ortho_char_pos_ids if ortho_char_dim else phono_char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # # Word inputs # if word_dim: input_dim += word_dim word_layer = HiddenLayer(word_vec_input_dim, word_dim, activation="tanh", name="word_emb_proj") # TO DO : Try not using the bias term in the hidden layer word_input = word_layer.link(word_vecs) inputs.append(word_input) # # Chars inputs # if ortho_char_dim: input_dim += ortho_char_lstm_dim ortho_char_layer = HiddenLayer(ortho_char_input_dim, 
ortho_char_dim, activation="tanh", name="ortho_char_emb_proj") # TO DO : Try not using bias in the hidden layer ortho_char_lstm_for = LSTM(ortho_char_dim, ortho_char_lstm_dim, with_batch=True, name='ortho_char_lstm_for') ortho_char_lstm_rev = LSTM(ortho_char_dim, ortho_char_lstm_dim, with_batch=True, name='ortho_char_lstm_rev') ortho_char_lstm_for.link( ortho_char_layer.link(ortho_char_for_vecs)) ortho_char_lstm_rev.link( ortho_char_layer.link(ortho_char_rev_vecs)) ortho_char_for_output = ortho_char_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), ortho_char_pos_ids] ortho_char_rev_output = ortho_char_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), ortho_char_pos_ids] inputs.append(ortho_char_for_output) if char_bidirect: inputs.append(ortho_char_rev_output) input_dim += ortho_char_lstm_dim if phono_char_dim: input_dim += phono_char_lstm_dim phono_char_layer = HiddenLayer(phono_char_input_dim, phono_char_dim, activation="tanh", name="phono_char_emb_proj") # TO DO : Try not using bias in the hidden layer phono_char_lstm_for = LSTM(phono_char_dim, phono_char_lstm_dim, with_batch=True, name='phono_char_lstm_for') phono_char_lstm_rev = LSTM(phono_char_dim, phono_char_lstm_dim, with_batch=True, name='phono_char_lstm_rev') phono_char_lstm_for.link( phono_char_layer.link(phono_char_for_vecs)) phono_char_lstm_rev.link( phono_char_layer.link(phono_char_rev_vecs)) phono_char_for_output = phono_char_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), phono_char_pos_ids] phono_char_rev_output = phono_char_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), phono_char_pos_ids] inputs.append(phono_char_for_output) if char_bidirect: inputs.append(phono_char_rev_output) input_dim += phono_char_lstm_dim # Type level sparse feats # if use_type_sparse_feats: input_dim += type_sparse_feats_input_dim type_level_sparse_layer = HiddenLayer( type_sparse_feats_input_dim, type_sparse_feats_proj_dim, activation="tanh", name='type_level_sparse_layer') # TO DO : Try not using the hidden layer here inputs.append(type_level_sparse_layer.link(type_sparse_feats)) # Prepare final input if len(inputs) != 1: inputs = T.concatenate(inputs, axis=1) # TO DO : If using type sparse features, then apply hidden layer after concatenating all inputs else: inputs = inputs[0] # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs """ Drop out involves sampling a vector of bernoulli random variables with a parameter 1-p and using it as a mask So, the expected value of the dropped out input is p * (0*x) + (1-p) * (1*x) = (1-p) * x. Since biases will on average respond to the expected input value, at test time we multiply test inputs (1-p) to supply the expected test input instead. 
""" inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] lstm_outputs = [word_for_output] post_word_lstm_output_size = word_lstm_dim if use_token_sparse_feats: # token_level_sparse_layer = HiddenLayer(token_sparse_feats_input_dim, token_sparse_feats_proj_dim, # activation="tanh", # name='token_level_sparse_layer') # # TO DO : Try not using the hidden layer here # lstm_outputs.append(token_level_sparse_layer.link(token_sparse_feats)) # post_word_lstm_output_size += token_sparse_feats_proj_dim lstm_outputs.append(token_sparse_feats) post_word_lstm_output_size += token_sparse_feats_input_dim if word_bidirect: lstm_outputs.append(word_rev_output) post_word_lstm_output_size += word_lstm_dim if len(lstm_outputs) > 1: final_output = T.concatenate(lstm_outputs, axis=1) tanh_layer = HiddenLayer(post_word_lstm_output_size, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) else: final_output = word_for_output final_pre_crf_input_size = word_lstm_dim attention_vectors = [] attention_vector_size = 0 if use_ortho_attention and ortho_char_dim: # final_ortho_attention_input_layer = HiddenLayer(post_word_lstm_output_size, ortho_char_lstm_dim, # name='final_ortho_attention_input_layer', activation='tanh') final_ortho_attention_input_layer = HiddenLayer( word_lstm_dim, ortho_char_lstm_dim, name='final_ortho_attention_input_layer', activation='tanh') final_ortho_attention_input = final_ortho_attention_input_layer.link( final_output) # Evaluating attentional vector using a linear projection from final_output since the attention vector # must be conditioned on it and dimension must match the char lstm hidden dim. ortho_for_attention = self.get_TDAttention_vector( final_ortho_attention_input, ortho_char_lstm_for.h.dimshuffle((1, 0, 2)), ortho_char_pos_ids) if char_bidirect: ortho_rev_attention = self.get_TDAttention_vector( final_ortho_attention_input, ortho_char_lstm_rev.h.dimshuffle((1, 0, 2)), ortho_char_pos_ids) attention_vectors.append(ortho_rev_attention) attention_vector_size += ortho_char_lstm_dim attention_vectors.append(ortho_for_attention) attention_vector_size += ortho_char_lstm_dim if use_phono_attention and phono_char_dim: # final_phono_attention_input_layer = HiddenLayer(post_word_lstm_output_size, phono_char_lstm_dim, # name='final_phono_attention_input_layer', activation='tanh') final_phono_attention_input_layer = HiddenLayer( word_lstm_dim, phono_char_lstm_dim, name='final_phono_attention_input_layer', activation='tanh') # Evaluating attentional vector using a linear projection from final_output since the attention vector # must be conditioned on it and dimension must match the char lstm hidden dim. 
final_phono_attention_input = final_phono_attention_input_layer.link( final_output) phono_for_attention = self.get_TDAttention_vector( final_phono_attention_input, phono_char_lstm_for.h.dimshuffle((1, 0, 2)), phono_char_pos_ids) if char_bidirect: phono_rev_attention = self.get_TDAttention_vector( final_phono_attention_input, phono_char_lstm_rev.h.dimshuffle((1, 0, 2)), phono_char_pos_ids) attention_vectors.append(phono_rev_attention) attention_vector_size += phono_char_lstm_dim attention_vectors.append(phono_for_attention) attention_vector_size += phono_char_lstm_dim if len(attention_vectors) > 1: attention_vectors = T.concatenate(attention_vectors, axis=1) if use_phono_attention or use_ortho_attention: final_output = T.concatenate([final_output, attention_vectors], axis=1) post_word_lstm_output_size += attention_vector_size final_pre_crf_input_size += attention_vector_size # Sentence to Named Entity tags - Score final_layer = HiddenLayer(final_pre_crf_input_size, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') # n_tags + 2 to accommodate start and end symbols small = -1000 # = -log(inf) b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) # Score of starting at start symbol is 1 => -log(1) = 0. Score of start symbol emitting any other NER # tag is -log(inf) = small e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) # Score of ending at end symbol is 1 => -log(1) = 0. Score of end symbol emitting any other NER # tag is -log(inf) = small observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1) # observations is the emission energy (-log potential) between each token and each tag. 
# Emission score of intermediate words towards start and end tags is -log(inf) observations = T.concatenate([b_s, observations, e_s], axis=0) # observations now contains the emission energies for start token, sentence tokens and end token # Score from tags real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Sum of energies associated with the gold tags # Score from transitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1]].sum() # Transition scores from label_i to label_{i+1} all_paths_scores = forward(observations, transitions) cost = -(real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if ortho_char_dim: self.add_component(ortho_char_layer) self.add_component(ortho_char_lstm_for) params.extend(ortho_char_layer.params) params.extend(ortho_char_lstm_for.params) if char_bidirect: self.add_component(ortho_char_lstm_rev) params.extend(ortho_char_lstm_rev.params) if phono_char_dim: self.add_component(phono_char_layer) self.add_component(phono_char_lstm_for) params.extend(phono_char_layer.params) params.extend(phono_char_lstm_for.params) if char_bidirect: self.add_component(phono_char_lstm_rev) params.extend(phono_char_lstm_rev.params) if use_type_sparse_feats: self.add_component(type_level_sparse_layer) params.extend(type_level_sparse_layer.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) if word_bidirect or len(lstm_outputs) > 1: self.add_component(tanh_layer) params.extend(tanh_layer.params) if use_ortho_attention and ortho_char_dim: self.add_component(final_ortho_attention_input_layer) params.extend(final_ortho_attention_input_layer.params) if use_phono_attention and phono_char_dim: self.add_component(final_phono_attention_input_layer) params.extend(final_phono_attention_input_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(transitions) params.append(transitions) # Prepare train and eval inputs eval_inputs = [] if word_dim: # eval_inputs.append(word_ids) eval_inputs.append(word_vecs) if ortho_char_dim: # eval_inputs.append(char_for_ids) eval_inputs.append(ortho_char_for_vecs) if char_bidirect: # eval_inputs.append(char_rev_ids) eval_inputs.append(ortho_char_rev_vecs) eval_inputs.append(ortho_char_pos_ids) if phono_char_dim: # eval_inputs.append(char_for_ids) eval_inputs.append(phono_char_for_vecs) if char_bidirect: # eval_inputs.append(char_rev_ids) eval_inputs.append(phono_char_rev_vecs) eval_inputs.append(phono_char_pos_ids) if use_type_sparse_feats: eval_inputs.append(type_sparse_feats) if use_token_sparse_feats: eval_inputs.append(token_sparse_feats) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' 
    if training:
        updates = Optimization(clip=5.0).get_updates(
            lr_method_name, cost, params, **lr_method_parameters)
        f_train = theano.function(
            inputs=train_inputs,
            outputs=cost,
            updates=updates,
            givens=({is_train: np.cast['int32'](1)} if dropout else {}))
    else:
        f_train = None

    # Compile evaluation function
    if not crf:
        f_eval = theano.function(
            inputs=eval_inputs,
            outputs=tags_scores,
            givens=({is_train: np.cast['int32'](0)} if dropout else {}))
    else:
        f_eval = theano.function(
            inputs=eval_inputs,
            outputs=forward(observations, transitions, viterbi=True,
                            return_alpha=False, return_best_sequence=True),
            givens=({is_train: np.cast['int32'](0)} if dropout else {}))
    print("Finished Compiling")

    return f_train, f_eval
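# Hedged illustration (a minimal numpy sketch, separate from the Theano graph
# built above): the linear-chain CRF cost used by this `build`.  `tags_scores`
# holds per-token emission scores, `transitions` is (n_tags + 2) x (n_tags + 2)
# with the two extra indices acting as start and end symbols, and the cost is
# -(score of the gold path - log-sum-exp over all paths).
import numpy as np

def crf_nll_sketch(tags_scores, transitions, tag_ids, small=-1000.0):
    s_len, n_tags = tags_scores.shape
    # Pad emissions with two dummy tags and add the start/end observations.
    b_s = np.array([[small] * n_tags + [0, small]])
    e_s = np.array([[small] * n_tags + [small, 0]])
    obs = np.concatenate([tags_scores, small * np.ones((s_len, 2))], axis=1)
    obs = np.concatenate([b_s, obs, e_s], axis=0)
    # Gold path: emission scores of the gold tags plus transition scores
    # along the padded sequence (start, y_1 ... y_n, end).
    padded = np.concatenate([[n_tags], tag_ids, [n_tags + 1]])
    gold = tags_scores[np.arange(s_len), tag_ids].sum()
    gold += transitions[padded[:-1], padded[1:]].sum()
    # Partition function via the forward recursion in log space.
    alpha = obs[0]
    for t in range(1, obs.shape[0]):
        scores = alpha[:, None] + transitions + obs[t][None, :]
        m = scores.max(axis=0)
        alpha = m + np.log(np.exp(scores - m).sum(axis=0))
    m = alpha.max()
    log_z = m + np.log(np.exp(alpha - m).sum())
    return -(gold - log_z)

# Example: 3 tokens, 2 real tags.
# crf_nll_sketch(np.random.randn(3, 2), np.random.randn(4, 4), np.array([0, 1, 1]))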
def ready(self): embedding_layer = self.embedding_layer args = self.args padding_id = embedding_layer.vocab_map["<padding>"] dropout = self.dropout = theano.shared( np.float64(args.dropout).astype(theano.config.floatX)) # len*batch x = self.x = T.imatrix() n_d = args.hidden_dimension n_e = embedding_layer.n_d activation = get_activation_by_name(args.activation) layers = self.layers = [] layer_type = args.layer.lower() for i in xrange(2): if layer_type == "rcnn": l = RCNN(n_in=n_e, n_out=n_d, activation=activation, order=args.order) elif layer_type == "lstm": l = LSTM(n_in=n_e, n_out=n_d, activation=activation) layers.append(l) # len * batch masks = T.cast(T.neq(x, padding_id), theano.config.floatX) # (len*batch)*n_e embs = embedding_layer.forward(x.ravel()) # len*batch*n_e embs = embs.reshape((x.shape[0], x.shape[1], n_e)) embs = apply_dropout(embs, dropout) self.word_embs = embs flipped_embs = embs[::-1] # len*bacth*n_d h1 = layers[0].forward_all(embs) h2 = layers[1].forward_all(flipped_embs) h_final = T.concatenate([h1, h2[::-1]], axis=2) h_final = apply_dropout(h_final, dropout) size = n_d * 2 output_layer = self.output_layer = ZLayer( n_in=size, n_hidden=args.hidden_dimension2, activation=activation) # sample z given text (i.e. x) z_pred, sample_updates = output_layer.sample_all(h_final) # we are computing approximated gradient by sampling z; # so should mark sampled z not part of the gradient propagation path # z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred) self.sample_updates = sample_updates print "z_pred", z_pred.ndim probs = output_layer.forward_all(h_final, z_pred) print "probs", probs.ndim logpz = -T.nnet.binary_crossentropy(probs, z_pred) * masks logpz = self.logpz = logpz.reshape(x.shape) probs = self.probs = probs.reshape(x.shape) # batch z = z_pred self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX) self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]), axis=0, dtype=theano.config.floatX) params = self.params = [] for l in layers + [output_layer]: for p in l.params: params.append(p) nparams = sum(len(x.get_value(borrow=True).ravel()) \ for x in params) say("total # parameters: {}\n".format(nparams)) l2_cost = None for p in params: if l2_cost is None: l2_cost = T.sum(p**2) else: l2_cost = l2_cost + T.sum(p**2) l2_cost = l2_cost * args.l2_reg self.l2_cost = l2_cost
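# Hedged illustration (a standalone numpy sketch, not the Theano expressions
# above): `zsum` penalises how many tokens the generator keeps and `zdiff`
# penalises how often the sampled binary mask switches between 0 and 1, which
# together push towards short, contiguous rationales.
import numpy as np

z = np.array([0, 1, 1, 1, 0, 0, 1, 0], dtype=np.float32)  # one sampled mask
zsum = z.sum()                        # number of selected tokens -> 4
zdiff = np.abs(z[1:] - z[:-1]).sum()  # number of 0/1 transitions -> 4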
def ready(self): args = self.args w_emb_layer = self.w_emb_layer c_emb_layer = self.c_emb_layer r_emb_layers = self.r_emb_layers r_matrix_layers = self.r_matrix_layers char_dim = self.char_dim = args.char_dim char_lstm_dim = self.char_lstm_dim = args.char_lstm_dim word_dim = self.word_dim = args.word_dim word_lstm_dim = self.word_lstm_dim = args.word_lstm_dim dropout = self.dropout = theano.shared( np.float64(args.dropout).astype(theano.config.floatX) ) word_ids = self.word_ids = T.ivector('word_ids') char_ids = self.char_ids = T.imatrix('char_ids') char_lens = self.char_lens = T.fvector('char_lens') char_masks = self.char_masks = T.imatrix('char_masks') up_ids = self.up_ids = T.imatrix('up_ids') up_rels = self.up_rels = T.imatrix('up_rels') up_id_masks = self.up_id_masks = T.imatrix('up_id_masks') down_ids = self.down_ids = T.imatrix('down_ids') down_rels = self.down_rels = T.imatrix('down_rels') down_id_masks = self.down_id_masks = T.imatrix('down_id_masks') tag_ids = self.tag_ids = T.ivector('tag_ids') layers = self.layers = [w_emb_layer, c_emb_layer] layers.extend(r_emb_layers) layers.extend(r_matrix_layers) inputs = self.inputs = [] inputs.append(self.word_ids) inputs.append(self.char_ids) inputs.append(self.char_lens) inputs.append(self.char_masks) inputs.append(self.up_ids) inputs.append(self.up_rels) inputs.append(self.up_id_masks) inputs.append(self.down_ids) inputs.append(self.down_rels) inputs.append(self.down_id_masks) inputs.append(self.tag_ids) wslices = w_emb_layer.forward(word_ids) cslices = c_emb_layer.forward(char_ids.ravel()) cslices = cslices.reshape((char_ids.shape[0], char_ids.shape[1], char_dim)) cslices = cslices.dimshuffle(1, 0, 2) bv_ur_slicess = [] bv_dr_slicess = [] b_ur_slicess = [] b_dr_slicess = [] bv_ur_matrixss = [] bv_dr_matrixss = [] b_ur_matrixss = [] b_dr_matrixss = [] for r_matrix_layer in r_matrix_layers: bv_ur_matrixs = r_matrix_layer.forward1(up_rels.ravel()) bv_dr_matrixs = r_matrix_layer.forward1(down_rels.ravel()) b_ur_matrixs = r_matrix_layer.forward2(up_rels.ravel()) b_dr_matrixs = r_matrix_layer.forward2(down_rels.ravel()) bv_ur_matrixss.append(bv_ur_matrixs.reshape((up_rels.shape[0], up_rels.shape[1], word_dim, word_dim))) bv_dr_matrixss.append(bv_dr_matrixs.reshape((down_rels.shape[0], down_rels.shape[1], word_dim, word_dim))) b_ur_matrixss.append(b_ur_matrixs.reshape((up_rels.shape[0], up_rels.shape[1], word_dim, word_dim))) b_dr_matrixss.append(b_dr_matrixs.reshape((down_rels.shape[0], down_rels.shape[1], word_dim, word_dim))) for r_emb_layer in r_emb_layers: bv_ur_slices = r_emb_layer.forward(up_rels.ravel()) bv_dr_slices = r_emb_layer.forward(down_rels.ravel()) b_ur_slices = r_emb_layer.forward2(up_rels.ravel()) b_dr_slices = r_emb_layer.forward2(down_rels.ravel()) bv_ur_slicess.append(bv_ur_slices.reshape((up_rels.shape[0], up_rels.shape[1], word_dim))) bv_dr_slicess.append(bv_dr_slices.reshape((down_rels.shape[0], down_rels.shape[1], word_dim))) b_ur_slicess.append(b_ur_slices.reshape((up_rels.shape[0], up_rels.shape[1], word_dim))) b_dr_slicess.append(b_dr_slices.reshape((down_rels.shape[0], down_rels.shape[1], word_dim))) char_masks = char_masks.dimshuffle(1, 0) prev_output = wslices prev_size = word_dim if char_dim: layers.append(LSTM( n_in = char_dim, n_out = char_lstm_dim, direction = 'bi' if args.char_bidirect else 'si' )) prev_output_2 = cslices prev_output_2 = apply_dropout(prev_output_2, dropout, v2 = True) prev_output_2 = layers[-1].forward_all(cslices, char_masks) prev_output_2 = T.sum(prev_output_2, axis = 0) prev_output_2 
= prev_output_2 / (1e-6 * T.ones_like(char_lens) + char_lens).dimshuffle(0, 'x') prev_size += char_lstm_dim prev_output = T.concatenate([prev_output, prev_output_2], axis = 1) prev_output = apply_dropout(prev_output, dropout) if args.conv != 0: for i in range(args.clayer): layers.append(GKNNMultiHeadGate( n_in = prev_size, n_out = prev_size, n_head = args.head )) prev_output = layers[-1].forward_all(prev_output, up_ids, up_id_masks, bv_ur_slicess[0], down_ids, down_id_masks, bv_dr_slicess[0]) prev_output = apply_dropout(prev_output, dropout) #prev_size *= 2 #layers.append(LSTM( # n_in = prev_size, # n_out = word_lstm_dim, # direction = 'bi' if args.word_bidirect else 'si' #)) #prev_output = prev_output.dimshuffle(0, 'x', 1) #prev_output = layers[-1].forward_all(prev_output) #prev_output = prev_output.reshape((prev_output.shape[0], prev_output.shape[-1])) #prev_size = word_lstm_dim layers.append(Layer( n_in = prev_size, n_out = args.classes, activation = linear, #ReLU, has_bias = False )) n_tags = args.classes s_len = char_ids.shape[0] tags_scores = layers[-1].forward(prev_output) transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1 ) observations = T.concatenate( [b_s, observations, e_s], axis=0 ) real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) pre_ids = T.arange(s_len + 1) s_ids = T.arange(s_len + 1) + 1 real_path_score += transitions[ padded_tags_ids[pre_ids], padded_tags_ids[s_ids] ].sum() all_paths_scores = CRFForward(observations, transitions) self.nll_loss = nll_loss = - (real_path_score - all_paths_scores) preds = CRFForward(observations, transitions, viterbi = True, return_alpha = False, return_best_sequence=True) self.pred = preds[1:-1] self.l2_sqr = None params = self.params = [transitions] for layer in layers: self.params += layer.params for p in self.params: if self.l2_sqr is None: self.l2_sqr = args.l2_reg * T.sum(p**2) else: self.l2_sqr += args.l2_reg * T.sum(p**2) #for l, i in zip(layers[3:], range(len(layers[3:]))): for l, i in zip(layers[2+len(r_emb_layers)+len(r_matrix_layers):], range(len(layers[2+len(r_emb_layers)+len(r_matrix_layers):]))): say("layer {}: n_in={}\tn_out={}\n".format( i, l.n_in, l.n_out )) nparams = sum(len(x.get_value(borrow=True).ravel()) \ for x in self.params) say("total # parameters: {}\n".format(nparams)) cost = self.nll_loss + self.l2_sqr lr_method_name = args.learning lr_method_parameters = {} lr_method_parameters['lr'] = args.learning_rate updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function( inputs = self.inputs, outputs = [cost, nll_loss], updates = updates, allow_input_downcast = True ) f_eval = theano.function( inputs = self.inputs[:-1], outputs = self.pred, allow_input_downcast = True ) return f_train, f_eval
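# Hedged illustration (a numpy sketch of the character pooling step above,
# with the extra assumption that padded time steps are explicitly zeroed):
# the per-character LSTM states are summed over time and divided by the real
# character count, with a small constant guarding against zero-length words.
import numpy as np

h = np.random.randn(6, 4, 8).astype(np.float32)       # (max_chars, n_words, dim)
char_lens = np.array([6, 3, 1, 0], dtype=np.float32)   # real characters per word
mask = (np.arange(6)[:, None] < char_lens[None, :]).astype(np.float32)
h = h * mask[:, :, None]                                # zero out padded steps
char_repr = h.sum(axis=0) / (1e-6 + char_lens)[:, None]  # (n_words, dim)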
def build(self, dropout, char_dim, char_hidden_dim, char_bidirect, layer2_hidden_dim, lr_method, layer2, batch_size, pre_emb, use_gaze, crf, training=True, **kwargs): """ Build the network. """ # Training parameters n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) # Network variables is_train = T.iscalar('is_train') # declare variable,声明整型变量is_train char_ids = T.ivector(name='char_ids') #声明整型一维向量 if use_gaze: gaze = T.imatrix(name='gaze') #hamming_cost = T.matrix('hamming_cost', theano.config.floatX) # 声明整型二维矩阵 # tag_ids = T.imatrix(name='tag_ids') tag_ids = T.ivector(name='tag_ids') # Sentence length s_len = char_ids.shape[0] #每个句子中的字数 # Final input (all word features) # # Char inputs # if char_dim: char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_input = char_layer.link(char_ids) # Initialize with pretrained embeddings if pre_emb and training: new_weights = char_layer.embeddings.get_value() print 'Loading pretrained embeddings from %s...' % pre_emb pretrained = {} emb_invalid = 0 for i, line in enumerate( codecs.open(pre_emb, 'r', 'utf-8', 'ignore')): line = line.rstrip().split() if len(line) == char_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_chars): char = self.id_to_char[i] if char in pretrained: new_weights[i] = pretrained[char] c_found += 1 elif char.lower() in pretrained: new_weights[i] = pretrained[char.lower()] c_lower += 1 elif re.sub('\d', '0', char) in pretrained: new_weights[i] = pretrained[re.sub('\d', '0', char)] c_zeros += 1 char_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained embeddings.' % len(pretrained) print( '%i / %i (%.4f%%) chars have been initialized with ' 'pretrained embeddings.') % ( c_found + c_lower + c_zeros, n_chars, 100. 
* (c_found + c_lower + c_zeros) / n_chars) print('%i found directly, %i after lower, %i after zero.') % ( c_found, c_lower, c_zeros) # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(char_input) input_test = (1 - dropout) * char_input char_input = T.switch(T.neq(is_train, 0), input_train, input_test) # 条件句 # LSTM for chars, first layer char_lstm_for1 = LSTM(char_dim, char_hidden_dim, with_batch=False, name='first_char_lstm_for') char_lstm_rev1 = LSTM(char_dim, char_hidden_dim, with_batch=False, name='first_char_lstm_rev') char_lstm_for1.link(char_input) # char的顺序: l i k e char_lstm_rev1.link(char_input[::-1, :]) # 单词的顺序: e k i l char_for_output1 = char_lstm_for1.h char_rev_output1 = char_lstm_rev1.h[::-1, :] if char_bidirect: final_output = T.concatenate([char_for_output1, char_rev_output1], axis=1) tanh_layer1 = HiddenLayer(2 * char_hidden_dim, char_hidden_dim, name='tanh_layer1', activation='tanh') final_output = tanh_layer1.link(final_output) else: final_output = char_for_output1 if layer2: # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(final_output) input_test = (1 - dropout) * final_output final_output = T.switch(T.neq(is_train, 0), input_train, input_test) # 条件句 # LSTM for chars, second layer char_lstm_for2 = LSTM(char_hidden_dim, layer2_hidden_dim, with_batch=False, name='second_char_lstm_for') char_lstm_rev2 = LSTM(char_hidden_dim, layer2_hidden_dim, with_batch=False, name='second_char_lstm_rev') char_lstm_for2.link(final_output) char_lstm_rev2.link(final_output[::-1, :]) char_for_output2 = char_lstm_for2.h char_rev_output2 = char_lstm_rev2.h[::-1, :] if char_bidirect: final_output = T.concatenate( [char_for_output2, char_rev_output2], axis=1) tanh_layer2 = HiddenLayer(2 * layer2_hidden_dim, layer2_hidden_dim, name='tanh_layer2', activation='tanh') final_output = tanh_layer2.link(final_output) else: final_output = char_for_output2 if layer2: dims = layer2_hidden_dim else: dims = char_hidden_dim if use_gaze: final_output = T.concatenate([final_output, gaze], axis=1) dims = dims + n_tags # final_output = T.reshape(final_output, (-1, input_dim)) # Sentence to Named Entity tags - Score,ci与CRF之间的隐含层 final_layer = HiddenLayer(dims, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1) observations = T.concatenate([b_s, observations, e_s], axis=0) # Score from tags real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # P中对应元素的求和好 # Score from add_componentnsitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[ padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1]].sum() # A中对应元素的求和 all_paths_scores = forward(observations, transitions) cost = -(real_path_score - all_paths_scores) # Network parameters params = [] if char_dim: self.add_component(char_layer) params.extend(char_layer.params) 
self.add_component(char_lstm_for1) params.extend(char_lstm_for1.params) if char_bidirect: self.add_component(char_lstm_rev1) params.extend(char_lstm_rev1.params) self.add_component(tanh_layer1) params.extend(tanh_layer1.params) if layer2: self.add_component(char_lstm_for2) params.extend(char_lstm_for2.params) if char_bidirect: self.add_component(char_lstm_rev2) params.extend(char_lstm_rev2.params) self.add_component(tanh_layer2) params.extend(tanh_layer2.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(transitions) params.append(transitions) # Prepare train and eval inputs eval_inputs = [] if char_dim: eval_inputs.append(char_ids) if use_gaze: eval_inputs.append(gaze) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' if training: updates = Optimization(clip=5.0).get_updates( lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function(inputs=train_inputs, outputs=cost, updates=updates, givens=({ is_train: np.cast['int32'](1) } if dropout else {})) else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function(inputs=eval_inputs, outputs=tags_scores, givens=({ is_train: np.cast['int32'](0) } if dropout else {})) else: f_eval = theano.function(inputs=eval_inputs, outputs=forward( observations, transitions, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({ is_train: np.cast['int32'](0) } if dropout else {})) return f_train, f_eval
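# Hedged illustration (plain-Python sketch of the lookup-table initialisation
# used in the build above and in the word-level builds later in this file;
# the exact fallback order varies slightly between them): each symbol is
# matched against the pretrained table first exactly, then lowercased, then
# with digits mapped to '0'; rows with no match keep their random init.
import re

def init_row(word, pretrained):
    if word in pretrained:
        return pretrained[word], 'found'
    if word.lower() in pretrained:
        return pretrained[word.lower()], 'lower'
    if re.sub('\d', '0', word.lower()) in pretrained:
        return pretrained[re.sub('\d', '0', word.lower())], 'zeros'
    return None, 'missing'

# init_row('1988', {'0000': [0.1, 0.2]}) -> ([0.1, 0.2], 'zeros')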
def ready(self): encoder = self.encoder embedding_layer = self.embedding_layer args = self.args padding_id = embedding_layer.vocab_map["<padding>"] dropout = self.dropout = encoder.dropout # len*batch x = self.x = encoder.x z = self.z = encoder.z n_d = args.hidden_dimension n_e = embedding_layer.n_d activation = get_activation_by_name(args.activation) layers = self.layers = [] layer_type = args.layer.lower() for i in range(2): if layer_type == "rcnn": l = RCNN( n_in=n_e, # if i == 0 else n_d, n_out=n_d, activation=activation, order=args.order) elif layer_type == "lstm": l = LSTM( n_in=n_e, # if i == 0 else n_d, n_out=n_d, activation=activation) layers.append(l) # len * batch #masks = T.cast(T.neq(x, padding_id), theano.config.floatX) masks = T.cast(T.neq(x, padding_id), "int8").dimshuffle((0, 1, "x")) # (len*batch)*n_e embs = embedding_layer.forward(x.ravel()) # len*batch*n_e embs = embs.reshape((x.shape[0], x.shape[1], n_e)) embs = apply_dropout(embs, dropout) flipped_embs = embs[::-1] # len*bacth*n_d h1 = layers[0].forward_all(embs) h2 = layers[1].forward_all(flipped_embs) h_final = T.concatenate([h1, h2[::-1]], axis=2) h_final = apply_dropout(h_final, dropout) size = n_d * 2 output_layer = self.output_layer = Layer(n_in=size, n_out=1, activation=sigmoid) # len*batch*1 probs = output_layer.forward(h_final) # len*batch probs2 = probs.reshape(x.shape) self.MRG_rng = MRG_RandomStreams() z_pred = self.z_pred = T.cast( self.MRG_rng.binomial(size=probs2.shape, p=probs2), "int8") # we are computing approximated gradient by sampling z; # so should mark sampled z not part of the gradient propagation path # self.z_pred = theano.gradient.disconnected_grad(z_pred) z2 = z.dimshuffle((0, 1, "x")) logpz = -T.nnet.binary_crossentropy(probs, z2) * masks logpz = self.logpz = logpz.reshape(x.shape) probs = self.probs = probs.reshape(x.shape) # batch zsum = T.sum(z, axis=0, dtype=theano.config.floatX) zdiff_pre = (z[1:] - z[:-1]) * 1.0 zdiff = T.sum(abs(zdiff_pre), axis=0, dtype=theano.config.floatX) loss_mat = encoder.loss_mat if args.aspect < 0: loss_vec = T.mean(loss_mat, axis=1) else: assert args.aspect < self.nclasses loss_vec = loss_mat[:, args.aspect] self.loss_vec = loss_vec coherent_factor = args.sparsity * args.coherent loss = self.loss = T.mean(loss_vec) sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \ T.mean(zdiff) * coherent_factor cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0)) self.obj = T.mean(cost_vec) params = self.params = [] for l in layers + [output_layer]: for p in l.params: params.append(p) nparams = sum(len(x.get_value(borrow=True).ravel()) \ for x in params) say("total # parameters: {}\n".format(nparams)) l2_cost = None for p in params: if l2_cost is None: l2_cost = T.sum(p**2) else: l2_cost = l2_cost + T.sum(p**2) l2_cost = l2_cost * args.l2_reg cost = self.cost = cost_logpz * 10 + l2_cost print("cost.dtype", cost.dtype) self.cost_e = loss * 10 + encoder.l2_cost
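# Hedged illustration (a numpy sketch of the idea behind `cost_logpz` above,
# not the Theano graph itself): because z is sampled, the generator is trained
# with a score-function (REINFORCE-style) surrogate in which each example's
# total cost multiplies the log-probability of the mask that was sampled for
# it; differentiating the surrogate only through `logpz` gives the update.
import numpy as np

probs = np.array([[0.9, 0.2], [0.8, 0.7], [0.1, 0.6]])   # p(z=1), (len, batch)
z = (np.random.rand(*probs.shape) < probs).astype(np.float64)  # sampled mask
logpz = z * np.log(probs) + (1 - z) * np.log(1 - probs)        # log p(z | x)
cost_vec = np.array([1.3, 0.4])        # per-example task + sparsity cost
surrogate = (cost_vec * logpz.sum(axis=0)).mean()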
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim,
          word_lstm_dim, word_bidirect, lr_method, pre_emb, crf, cap_dim,
          training=True, **kwargs):
    """
    Build the network.
    """
    # Training parameters
    n_words = len(self.id_to_word)
    n_chars = len(self.id_to_char)
    n_tags = len(self.id_to_tag)

    # Number of capitalization features
    if cap_dim:
        n_cap = 4

    # Network variables
    is_train = T.iscalar('is_train')
    word_ids = T.ivector(name='word_ids')
    char_for_ids = T.imatrix(name='char_for_ids')
    char_rev_ids = T.imatrix(name='char_rev_ids')
    char_pos_ids = T.ivector(name='char_pos_ids')
    tag_ids = T.ivector(name='tag_ids')
    cap_ids = T.ivector(name='cap_ids')

    # Sentence length
    # Final input (all word features)
    input_dim = 0
    inputs = []
    s_len = (char_pos_ids).shape[0]

    #
    # Chars inputs
    #
    input_dim += (char_lstm_dim * 2)
    char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')
    char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=False,
                         name='char_lstm_for')
    char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=False,
                         name='char_lstm_rev')
    char_lstm_for.link(char_layer.link(word_ids))
    char_lstm_rev.link(char_layer.link(cap_ids))

    final_layer = HiddenLayer(char_lstm_dim, n_chars,
                              name='final_char_layer', activation='softmax')
    chars_final = final_layer.link(char_lstm_for.h)
    final_rev_layer = HiddenLayer(char_lstm_dim, n_chars,
                                  name='final_char_rev_layer',
                                  activation='softmax')
    # Project the reverse LSTM states with their own output layer.
    chars_rev_final = final_rev_layer.link(char_lstm_rev.h)

    cost_chars = T.nnet.categorical_crossentropy(
        chars_final, char_pos_ids).mean()
    cost_chars_rev = T.nnet.categorical_crossentropy(
        chars_rev_final, tag_ids).mean()

    # Network parameters
    params = []
    if char_dim:
        self.add_component(char_layer)
        self.add_component(char_lstm_for)
        params.extend(char_layer.params)
        params.extend(char_lstm_for.params)
        self.add_component(char_lstm_rev)
        params.extend(char_lstm_rev.params)

    # Prepare train and eval inputs
    eval_inputs = []
    if word_dim:
        eval_inputs.append(word_ids)
    if char_dim:
        eval_inputs.append(char_for_ids)
        if char_bidirect:
            eval_inputs.append(char_rev_ids)
        eval_inputs.append(char_pos_ids)
    #if cap_dim:
    eval_inputs.append(tag_ids)
    eval_inputs.append(cap_ids)

    # Parse optimization method parameters
    if "-" in lr_method:
        lr_method_name = lr_method[:lr_method.find('-')]
        lr_method_parameters = {}
        for x in lr_method[lr_method.find('-') + 1:].split('-'):
            split = x.split('_')
            assert len(split) == 2
            lr_method_parameters[split[0]] = float(split[1])
    else:
        lr_method_name = lr_method
        lr_method_parameters = {}

    # Fetch gradients from both char_lstms
    gradients = T.grad(cost_chars, char_lstm_for.params)
    gradients_rev = T.grad(cost_chars_rev, char_lstm_rev.params)

    # Return forward char_lstm grads
    f_eval = theano.function(
        inputs=eval_inputs,
        outputs=gradients,
        givens=({is_train: np.cast['int32'](0)} if dropout else {}),
        on_unused_input='ignore'
    )
    # Return reverse char_lstm grads
    f_eval_rev = theano.function(
        inputs=eval_inputs,
        outputs=gradients_rev,
        givens=({is_train: np.cast['int32'](0)} if dropout else {}),
        on_unused_input='ignore'
    )
    return f_eval, f_eval_rev
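# Hedged illustration (a minimal standalone Theano sketch, separate from the
# probe above): T.grad differentiates a scalar cost with respect to a list of
# shared parameters, and the resulting expressions compile into a function
# like any other output, which is how f_eval / f_eval_rev expose the
# character-LSTM gradients.
import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.ones(3, dtype=theano.config.floatX), name='w')
x = T.vector('x')
cost = ((w * x) ** 2).sum()
grads = T.grad(cost, [w])              # list of gradient expressions
f_grads = theano.function([x], grads)
# f_grads(np.array([1., 2., 3.], dtype=theano.config.floatX))
#   -> [array([ 2.,  8., 18.])]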
def ready(self): global total_generate_time #say("in generator ready: \n") #start_generate_time = time.time() embedding_layer = self.embedding_layer args = self.args padding_id = embedding_layer.vocab_map["<padding>"] dropout = self.dropout = theano.shared( np.float64(args.dropout).astype(theano.config.floatX)) # len*batch x = self.x = T.imatrix() n_d = args.hidden_dimension n_e = embedding_layer.n_d activation = get_activation_by_name(args.activation) layers = self.layers = [] layer_type = args.layer.lower() for i in xrange(2): if layer_type == "rcnn": l = RCNN(n_in=n_e, n_out=n_d, activation=activation, order=args.order) elif layer_type == "lstm": l = LSTM(n_in=n_e, n_out=n_d, activation=activation) l = Layer(n_in=n_e, n_out=n_d, activation=sigmoid) layers.append(l) # len * batch #masks = T.cast(T.neq(x, padding_id), theano.config.floatX) masks = T.cast(T.neq(x, padding_id), theano.config.floatX).dimshuffle( (0, 1, "x")) # (len*batch)*n_e embs = embedding_layer.forward(x.ravel()) # len*batch*n_e embs = embs.reshape((x.shape[0], x.shape[1], n_e)) embs = apply_dropout(embs, dropout) self.word_embs = embs flipped_embs = embs[::-1] # len*bacth*n_d h1 = layers[0].forward(embs) h2 = layers[1].forward(flipped_embs) h_final = T.concatenate([h1, h2[::-1]], axis=2) h_final = apply_dropout(h_final, dropout) size = n_d * 2 #size = n_e output_layer = self.output_layer = Layer(n_in=size, n_out=1, activation=sigmoid) # len*batch*1 probs = output_layer.forward(h_final) #probs = output_layer.forward(embs) #probs1 = probs.reshape(x.shape) #probs_rev = output_layer.forward(flipped_embs) #probs1_rev = probs.reshape(x.shape) #probs = T.concatenate([probs1, probs1_rev[::-1]], axis=2) # len*batch probs2 = probs.reshape(x.shape) if self.args.seed is not None: self.MRG_rng = MRG_RandomStreams(self.args.seed) else: self.MRG_rng = MRG_RandomStreams() z_pred = self.z_pred = T.cast( self.MRG_rng.binomial(size=probs2.shape, p=probs2), theano.config.floatX) #"int8") # we are computing approximated gradient by sampling z; # so should mark sampled z not part of the gradient propagation path # z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred) #self.sample_updates = sample_updates print "z_pred", z_pred.ndim z2 = z_pred.dimshuffle((0, 1, "x")) logpz = -T.nnet.binary_crossentropy(probs, z2) * masks logpz = self.logpz = logpz.reshape(x.shape) probs = self.probs = probs.reshape(x.shape) # batch z = z_pred self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX) self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]), axis=0, dtype=theano.config.floatX) params = self.params = [] for l in layers + [output_layer]: for p in l.params: params.append(p) nparams = sum(len(x.get_value(borrow=True).ravel()) \ for x in params) say("total # parameters: {}\n".format(nparams)) l2_cost = None for p in params: if l2_cost is None: l2_cost = T.sum(p**2) else: l2_cost = l2_cost + T.sum(p**2) l2_cost = l2_cost * args.l2_reg self.l2_cost = l2_cost
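# Hedged illustration (a numpy sketch of the bidirectional trick used
# throughout these models): run one recurrence left-to-right, run a second
# one on the reversed sequence, then flip its outputs back before
# concatenating, so position t sees both a prefix and a suffix summary.
# A cumulative sum stands in for the recurrent layer here.
import numpy as np

x = np.arange(5, dtype=np.float32)    # toy sequence
h_fwd = np.cumsum(x)                  # "forward RNN": summary of x[:t+1]
h_bwd = np.cumsum(x[::-1])[::-1]      # reversed pass, flipped back: x[t:]
h = np.stack([h_fwd, h_bwd], axis=1)  # per-position bidirectional context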
def build( self, dropout, char_dim, char_hidden_dim, char_bidirect, word_dim, word_hidden_dim, word_bidirect, tagger_hidden_dim, hamming_cost, L2_reg, lr_method, pre_word_emb, pre_char_emb, tagger, use_gaze, POS, plot_cost, #cap_dim, training=True, **kwargs): """ Build the network. """ # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) # n_pos = len(self.id_to_pos) + 1 # Number of capitalization features #if cap_dim: # n_cap = 4 # Network variables is_train = T.iscalar('is_train') # declare variable,声明整型变量is_train word_ids = T.ivector(name='word_ids') #声明整型一维向量 char_for_ids = T.imatrix(name='char_for_ids') # 声明整型二维矩阵 char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') if use_gaze: gaze = T.imatrix(name='gaze') if POS: # pos_ids = T.ivector(name='pos_ids') pos_one_hot = T.imatrix(name='pos_one_hot') #hamming_cost = T.matrix('hamming_cost', theano.config.floatX) # 声明整型二维矩阵 tag_ids = T.ivector(name='tag_ids') #if cap_dim: # cap_ids = T.ivector(name='cap_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] #句子中的单词数 # Final input (all word features) input_dim = 0 inputs = [] L2_norm = 0.0 theano.config.compute_test_value = 'off' # # Word inputs # if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) inputs.append(word_input) # Initialize with pretrained embeddings if pre_word_emb and training: new_weights = word_layer.embeddings.get_value() print 'Loading pretrained word embeddings from %s...' % pre_word_emb pretrained = {} emb_invalid = 0 for i, line in enumerate( codecs.open(pre_word_emb, 'r', 'utf-8', 'ignore')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid word embedding lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word) in pretrained: new_weights[i] = pretrained[re.sub('\d', '0', word)] c_zeros += 1 word_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained word embeddings.' % len(pretrained) print( '%i / %i (%.4f%%) words have been initialized with ' 'pretrained word embeddings.') % ( c_found + c_lower + c_zeros, n_words, 100. * (c_found + c_lower + c_zeros) / n_words) print('%i found directly, %i after lowercasing + zero.') % ( c_found, c_lower + c_zeros) L2_norm += (word_layer.embeddings**2).sum() # # Chars inputs # if char_dim: input_dim += char_hidden_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_for_input = char_layer.link(char_for_ids) char_rev_input = char_layer.link(char_rev_ids) # Initialize with pretrained char embeddings if pre_char_emb and training: new_weights = char_layer.embeddings.get_value() print 'Loading pretrained char embeddings from %s...' 
% pre_char_emb pretrained = {} emb_invalid = 0 for i, line in enumerate( codecs.open(pre_char_emb, 'r', 'utf-8', 'ignore')): line = line.rstrip().split() if len(line) == char_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid char embedding lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_chars): char = self.id_to_char[i] if char in pretrained: new_weights[i] = pretrained[char] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', char) in pretrained: new_weights[i] = pretrained[re.sub('\d', '0', char)] c_zeros += 1 char_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained char embeddings.' % len(pretrained) print( '%i / %i (%.4f%%) words have been initialized with ' 'pretrained char embeddings.') % ( c_found + c_lower + c_zeros, n_chars, 100. * (c_found + +c_lower + c_zeros) / n_chars) print('%i found directly, %i after lowercasing + zero.') % ( c_found, c_lower + c_zeros) L2_norm += (char_layer.embeddings**2).sum() char_lstm_for = LSTM(char_dim, char_hidden_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_hidden_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_for_input) char_lstm_rev.link(char_rev_input) char_for_output = char_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] char_rev_output = char_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] for param in char_lstm_for.params[:8]: L2_norm += (param**2).sum() if char_bidirect: char_lstm_hidden = T.concatenate( [char_for_output, char_rev_output], axis=1) input_dim += char_hidden_dim for param in char_lstm_rev.params[:8]: L2_norm += (param**2).sum() else: char_lstm_hidden = char_for_output inputs.append(char_lstm_hidden) # if POS: # pos_dim = 20 # input_dim += pos_dim # pos_layer = EmbeddingLayer(n_pos, pos_dim, name='pos_layer') # pos_input = pos_layer.link(pos_ids) # inputs.append(pos_input) # L2_norm += (pos_layer.embeddings ** 2).sum() #if len(inputs) != 1: inputs = T.concatenate(inputs, axis=1) # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # 条件句 # if POS: # inputs = T.concatenate([inputs, pos_one_hot], axis= 1) # input_dim += 6 # LSTM for words word_lstm_for = LSTM(input_dim, word_hidden_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_hidden_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) # 单词的顺序: I like dog word_lstm_rev.link(inputs[::-1, :]) # 单词的顺序: dog like I word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] for param in word_lstm_for.params[:8]: L2_norm += (param**2).sum() if word_bidirect: final_output = T.concatenate([word_for_output, word_rev_output], axis=1) tanh_layer = HiddenLayer(2 * word_hidden_dim, word_hidden_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) for param in word_lstm_rev.params[:8]: L2_norm += (param**2).sum() else: final_output = word_for_output dims = word_hidden_dim if use_gaze: final_output = T.concatenate([final_output, gaze], axis=1) dims = word_hidden_dim + n_tags if POS: final_output = T.concatenate([final_output, pos_one_hot], axis=1) dims += 6 # if word_bidirect: # 
final_output = T.concatenate( # [word_for_output, word_rev_output], # axis=1 # ) # tanh_layer = HiddenLayer(2 * word_hidden_dim, word_hidden_dim, # name='tanh_layer', activation='tanh') # final_output = tanh_layer.link(final_output) # else: # final_output = word_for_output # Sentence to Named Entity tags ## final_layer = HiddenLayer(dims, n_tags, name='final_layer', ## activation=(None if crf else 'softmax')) # final_layer = HiddenLayer(word_hidden_dim, n_tags, name='final_layer', # activation=(None if crf else 'softmax')) ## tags_scores = final_layer.link(final_output) ## L2_norm += (final_layer.params[0] ** 2).sum() # No CRF if tagger == 'lstm': tagger_layer = LSTM_d(dims, tagger_hidden_dim, with_batch=False, name='LSTM_d') tagger_layer.link(final_output) final_output = tagger_layer.t dims = tagger_hidden_dim for param in tagger_layer.params[:8]: L2_norm += (param**2).sum() final_layer = HiddenLayer( dims, n_tags, name='final_layer', activation=(None if tagger == 'crf' else 'softmax')) tags_scores = final_layer.link(final_output) L2_norm += (final_layer.params[0]**2).sum() if tagger != 'crf': cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1) observations = T.concatenate([b_s, observations, e_s], axis=0) # Score from tags real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # P中对应元素的求和好 # Score from add_componentnsitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[ padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1]].sum() # A中对应元素的求和 all_paths_scores = forward(observations, transitions, hamming_cost=hamming_cost, n_tags=n_tags, padded_tags_ids=padded_tags_ids) L2_norm += (transitions**2).sum() cost = -(real_path_score - all_paths_scores) + L2_reg * L2_norm # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if char_dim: self.add_component(char_layer) params.extend(char_layer.params) self.add_component(char_lstm_for) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) # if POS: # self.add_component(pos_layer) # params.extend(pos_layer.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) self.add_component(final_layer) params.extend(final_layer.params) if tagger == 'lstm': self.add_component(tagger_layer) params.extend(tagger_layer.params) elif tagger == 'crf': self.add_component(transitions) params.append(transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if use_gaze: eval_inputs.append(gaze) if POS: # eval_inputs.append(pos_ids) eval_inputs.append(pos_one_hot) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) #if cap_dim: # eval_inputs.append(cap_ids) train_inputs = eval_inputs + [tag_ids] # Parse optimization 
method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' if training: updates = Optimization(clip=5.0).get_updates( lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function(inputs=train_inputs, outputs=cost, updates=updates, givens=({ is_train: np.cast['int32'](1) } if dropout else {}), on_unused_input='warn') else: f_train = None if plot_cost: f_plot_cost = theano.function(inputs=train_inputs, outputs=cost, givens=({ is_train: np.cast['int32'](1) } if dropout else {}), on_unused_input='warn') else: f_plot_cost = None # Compile evaluation function if tagger != 'crf': f_eval = theano.function(inputs=eval_inputs, outputs=tags_scores, givens=({ is_train: np.cast['int32'](0) } if dropout else {}), on_unused_input='warn') else: f_eval = theano.function(inputs=eval_inputs, outputs=forward( observations, transitions, hamming_cost=0, n_tags=None, padded_tags_ids=None, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({ is_train: np.cast['int32'](0) } if dropout else {}), on_unused_input='warn') return f_train, f_eval, f_plot_cost
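# Hedged illustration (plain-Python sketch of the `lr_method` string
# convention parsed by these `build` functions): the optimizer name comes
# before the first '-', and each remaining 'key_value' chunk becomes a float
# hyper-parameter.  The example string is illustrative only.
def parse_lr_method(lr_method):
    if '-' not in lr_method:
        return lr_method, {}
    name = lr_method[:lr_method.find('-')]
    params = {}
    for x in lr_method[lr_method.find('-') + 1:].split('-'):
        key, value = x.split('_')
        params[key] = float(value)
    return name, params

# parse_lr_method('sgd-lr_0.005') -> ('sgd', {'lr': 0.005})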
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim, word_lstm_dim, word_bidirect, lr_method, pre_emb, pre_voc, crf, pos_dim, n_pos, training = 1, **kwargs ): """ Build the network. """ # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_y) n_cap = 2 # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') tag_ids = T.ivector(name='tag_ids') cap_ids = T.ivector(name='cap_ids') if pos_dim: pos_ids = T.ivector(name='pos_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # # Word inputs # if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training: new_weights = word_layer.embeddings.get_value() print 'Loading pretrained embeddings from %s...' % pre_emb pretrained = {} emb_invalid = 0 emb_matrix = np.load(pre_emb) pre_w2idxs = dict([(w,i) for i,w in enumerate(np.load(pre_voc))]) print pre_w2idxs.items()[:10] assert emb_matrix[0].shape[0] == word_dim for w in pre_w2idxs: pretrained[w.lower()] = np.array( [float(x) for x in emb_matrix[pre_w2idxs[w]]]).astype(np.float32) if emb_invalid > 0: print 'WARNING: %i invalid lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word.lower()) in pretrained: new_weights[i] = pretrained[ re.sub('\d', '0', word.lower()) ] c_zeros += 1 word_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained embeddings.' % len(pretrained) print ('%i / %i (%.4f%%) words have been initialized with ' 'pretrained embeddings.') % ( c_found + c_lower + c_zeros, n_words, 100. 
* (c_found + c_lower + c_zeros) / n_words ) print ('%i found directly, %i after lowercasing, ' '%i after lowercasing + zero.') % ( c_found, c_lower, c_zeros ) # # Chars inputs # if char_dim: input_dim += char_lstm_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[ T.arange(s_len), char_pos_ids ] char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[ T.arange(s_len), char_pos_ids ] inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim # # Cue feature # input_dim += word_dim cap_layer = EmbeddingLayer(n_cap, word_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) # # POS feature # if pos_dim: input_dim += word_dim pos_layer = EmbeddingLayer(n_pos, word_dim, name="pos_layer") inputs.append(pos_layer.link(pos_ids)) # Prepare final input # if len(inputs) != 1: inputs = T.concatenate(inputs, axis=1) # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] if word_bidirect: final_output = T.concatenate( [word_for_output, word_rev_output], axis=1 ) tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) else: final_output = word_for_output # Sentence to Named Entity tags - Score final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1 ) observations = T.concatenate( [b_s, observations, e_s], axis=0 ) # Score from tags real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Score from transitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[ padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1] ].sum() all_paths_scores = forward(observations, transitions) cost = - (real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) 
params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) # Add cue layer (cap for the moment) self.add_component(cap_layer) params.extend(cap_layer.params) # Add pos tag layer if pos_dim: self.add_component(pos_layer) params.extend(pos_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(transitions) params.append(transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) # add cue vector to the inputs eval_inputs.append(cap_ids) # add pos vector to the inputs if pos_dim: eval_inputs.append(pos_ids) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' if training: updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function( inputs=train_inputs, outputs=cost, updates=updates, givens=({is_train: np.cast['int32'](1)} if dropout else {}) ) else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function( inputs=eval_inputs, outputs=tags_scores, givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) else: f_eval = theano.function( inputs=eval_inputs, outputs=forward(observations, transitions, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) return f_train, f_eval
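# Hedged illustration (a numpy sketch of the decoding path compiled above
# with viterbi=True, simplified so that the path is forced to finish at the
# end symbol): dynamic programming over the same padded emission and
# transition scores, keeping the best predecessor for every tag at every
# step and backtracking at the end.
import numpy as np

def viterbi_sketch(obs, transitions):
    # obs: (s_len + 2, n_tags + 2) padded emissions; transitions: square matrix
    score = obs[0]
    back = []
    for t in range(1, obs.shape[0]):
        total = score[:, None] + transitions + obs[t][None, :]
        back.append(total.argmax(axis=0))
        score = total.max(axis=0)
    best = [obs.shape[1] - 1]          # finish at the end symbol
    for ptr in reversed(back):
        best.append(ptr[best[-1]])
    best.reverse()
    return best[1:-1]                  # strip the start/end symbols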
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim, word_lstm_dim, word_bidirect, lr_method, pre_emb, crf, cap_dim, training=True, **kwargs ): """ Build the network. """ # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) # Number of capitalization features if cap_dim: n_cap = 4 # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') tag_ids = T.ivector(name='tag_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # # Word inputs # if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training: # Randomly generates new weights new_weights = word_layer.embeddings.get_value() print 'Loading pretrained embeddings from %s...' % pre_emb # Here is where we will substitute pyemblib read function. # Syntax: get_embedding_dict(emb_path, emb_format, first_n, vocab) emb_format = pyemblib2.Format.Word2Vec pretrained = get_embedding_dict(pre_emb, emb_format, 0, None) ''' pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]] ).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid lines' % emb_invalid ''' c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word.lower()) in pretrained: new_weights[i] = pretrained[ re.sub('\d', '0', word.lower()) ] c_zeros += 1 # This is it, this is what needs to be printed. # "word_layer.embeddings" is a "theano.shared" object word_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained embeddings.' % len(pretrained) print ('%i / %i (%.4f%%) words have been initialized with ' 'pretrained embeddings.') % ( c_found + c_lower + c_zeros, n_words, 100. 
* (c_found + c_lower + c_zeros) / n_words ) print ('%i found directly, %i after lowercasing, ' '%i after lowercasing + zero.') % ( c_found, c_lower, c_zeros ) # # Chars inputs # if char_dim: input_dim += char_lstm_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[ T.arange(s_len), char_pos_ids ] char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[ T.arange(s_len), char_pos_ids ] inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim # # Capitalization feature # if cap_dim: input_dim += cap_dim cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) # Prepare final input inputs = T.concatenate(inputs, axis=1) if len(inputs) != 1 else inputs[0] # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] if word_bidirect: final_output = T.concatenate( [word_for_output, word_rev_output], axis=1 ) tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) else: final_output = word_for_output # Sentence to Named Entity tags - Score final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1 ) observations = T.concatenate( [b_s, observations, e_s], axis=0 ) # Score from tags real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Score from transitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[ padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1] ].sum() all_paths_scores = forward(observations, transitions) cost = - (real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) # Supposedly the commented-out line below will stop # the model from updating the pretrained emeddings. 
# params.extend(word_layer.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) if cap_dim: self.add_component(cap_layer) params.extend(cap_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(transitions) params.append(transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' if training: # "params" supposedly contains the pretrained embedding matrix that we are updating. # Find the "get_updates" function and figure out what it does. updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function( inputs=train_inputs, outputs=cost, updates=updates, givens=({is_train: np.cast['int32'](1)} if dropout else {}) ) #======================================== # FUNCTION TO PRINT PRETRAINED EMBEDDINGS # The function below takes one argument, which it prints # along with the specified print message. print_matrix = T.dmatrix() print_op = printing.Print('print message') printed_x = print_op(print_matrix) f_print = function([print_matrix], printed_x) #======================================== else: f_train = None f_print = None # We return a tuple of things used to print the embedding so that it looks nicer. print_tuple = [f_print, word_layer.embeddings] # Compile evaluation function if not crf: f_eval = theano.function( inputs=eval_inputs, outputs=tags_scores, givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) else: f_eval = theano.function( inputs=eval_inputs, outputs=forward(observations, transitions, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) return f_train, f_eval, print_tuple
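# Hedged illustration (an assumption, since the mapping is not shown in this
# file): a common 4-way capitalisation feature consistent with n_cap = 4 --
# all lowercase, all uppercase, initial capital, or a capital elsewhere.
# Ids produced this way would feed `cap_ids` and the `cap_layer` embedding
# in the build above.
def cap_feature(word):
    if word.lower() == word:
        return 0        # no capital letters
    if word.upper() == word:
        return 1        # all capitals
    if word[0].upper() == word[0]:
        return 2        # initial capital only
    return 3            # capital letter(s) inside the word

# [cap_feature(w) for w in ['dog', 'NASA', 'Paris', 'iPhone']] -> [0, 1, 2, 3]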