def __build_internal_node(self, c1_node, c2_node):
    """Build a new internal node that represents the composition of
    c1_node.p and c2_node.p computed by the autoencoder.

    Args:
        c1_node: left child node
        c2_node: right child node

    Returns:
        value1: a new internal node
        value2: reconstruction error, a scalar
    """
    c1 = c1_node.p
    c2 = c2_node.p
    # Compose the two children into a parent representation and renormalize.
    p_unnormalized = self.f(dot(self.Wi1, c1) + dot(self.Wi2, c2) + self.bi)
    p = p_unnormalized / LA.norm(p_unnormalized, axis=0)
    # Decode the parent back into reconstructions of both children.
    y1_unnormalized = self.f(dot(self.Wo1, p) + self.bo1)
    y1 = y1_unnormalized / LA.norm(y1_unnormalized, axis=0)
    y2_unnormalized = self.f(dot(self.Wo2, p) + self.bo2)
    y2 = y2_unnormalized / LA.norm(y2_unnormalized, axis=0)
    y1c1 = y1 - c1
    y2c2 = y2 - c2
    node = InternalNode(-1, c1_node, c2_node,
                        p, p_unnormalized,
                        y1c1, y2c2,
                        y1_unnormalized, y2_unnormalized)
    # Reconstruction error: 0.5 * (||y1 - c1||^2 + ||y2 - c2||^2)
    reconstruction_error = sum_along_column(y1c1**2) + sum_along_column(y2c2**2)
    reconstruction_error = 0.5 * reconstruction_error[0]
    return node, reconstruction_error
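# --- A minimal sketch (not from the original source) of the shared helpers
# these fragments assume. Field names and signatures are inferred from how
# the surrounding code uses them.
import numpy as np
from numpy import dot, arange, zeros, zeros_like, tanh, concatenate
from numpy import linalg as LA


def sum_along_column(x):
    # Sum each column of a 2-D array (one scalar per column), matching
    # test_sum_along_column below: [[1, 2, 3], [4, 5, 6]] -> [5, 7, 9].
    return np.sum(x, axis=0)


class LeafNode(object):
    def __init__(self, index, p):
        self.index = index  # word position (or word id, in forward_la)
        self.p = p          # column vector: the word embedding


class InternalNode(object):
    def __init__(self, index, left_child, right_child, p, p_unnormalized,
                 y1c1, y2c2, y1_unnormalized, y2_unnormalized):
        self.index = index
        self.left_child = left_child
        self.right_child = right_child
        self.p = p                          # normalized parent vector
        self.p_unnormalized = p_unnormalized
        self.y1c1 = y1c1                    # y1 - c1, cached for backprop
        self.y2c2 = y2c2                    # y2 - c2, cached for backprop
        self.y1_unnormalized = y1_unnormalized
        self.y2_unnormalized = y2_unnormalized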
def process_la(src_rae_la, trg_rae_la, alpha,
               src_word_vectors, src_instances, src_total_internal_node,
               trg_word_vectors, trg_instances, trg_total_internal_node,
               bad_src_instances, bad_trg_instances):
    # Initialize gradient accumulators.
    src_gradients_la = src_rae_la.get_zero_gradients_la()
    trg_gradients_la = trg_rae_la.get_zero_gradients_la()

    src_total_rec_error = 0
    trg_total_rec_error = 0
    src_total_sem_error = 0
    trg_total_sem_error = 0

    for i in xrange(len(src_instances)):
        src_instance = src_instances[i]
        trg_instance = trg_instances[i]
        bad_src_instance = bad_src_instances[i]
        bad_trg_instance = bad_trg_instances[i]

        # Look up the embeddings of every word in the phrase: the word ids
        # in instance.words become an n x word_num matrix of column vectors.
        src_words_embedded = src_word_vectors[src_instance.words]
        trg_words_embedded = trg_word_vectors[trg_instance.words]
        bad_src_embedded = src_word_vectors[bad_src_instance]
        bad_trg_embedded = trg_word_vectors[bad_trg_instance]

        # Forward pass: compute reconstruction errors.
        src_root_node, src_rec_error = src_rae_la.forward_la(src_words_embedded, src_instance)
        trg_root_node, trg_rec_error = trg_rae_la.forward_la(trg_words_embedded, trg_instance)
        src_total_rec_error += src_rec_error * src_instance.freq
        trg_total_rec_error += trg_rec_error * trg_instance.freq

        # The bad (negative) roots are referenced below, so their forward
        # passes must run as well.
        bad_src_root, _ = src_rae_la.forward_la(bad_src_embedded, src_instance)
        bad_trg_root, _ = trg_rae_la.forward_la(bad_trg_embedded, trg_instance)

        rec_s = alpha * src_instance.freq / src_total_internal_node
        rec_t = alpha * trg_instance.freq / trg_total_internal_node
        sem_s = (1 - alpha) * src_instance.freq / src_total_internal_node
        sem_t = (1 - alpha) * trg_instance.freq / trg_total_internal_node

        # Semantic error, source side: map the source root through the
        # cross-lingual layer and compare with the target root.
        src_yla_unnormalized = tanh(dot(src_rae_la.Wla, src_root_node.p) + src_rae_la.bla)
        src_yla = src_yla_unnormalized / LA.norm(src_yla_unnormalized, axis=0)
        src_ylapla = src_yla - trg_root_node.p
        src_sem_error = 0.5 * sum_along_column(src_ylapla**2)[0]

        bad_src_ylapla = src_yla - bad_trg_root.p
        bad_src_sem_error = 0.5 * sum_along_column(bad_src_ylapla**2)[0]

        # This variant uses the plain semantic error; the max-margin form
        # (src_sem_error - bad_src_sem_error + 1) * src_instance.freq is
        # left disabled.
        src_sem_margin = src_sem_error
        src_sem_margin = max(0.0, src_sem_margin)
        soptimal = (src_sem_margin == 0.0)
        src_total_sem_error += src_sem_margin

        # Target side.
        trg_yla_unnormalized = tanh(dot(trg_rae_la.Wla, trg_root_node.p) + trg_rae_la.bla)
        trg_yla = trg_yla_unnormalized / LA.norm(trg_yla_unnormalized, axis=0)
        trg_ylapla = trg_yla - src_root_node.p
        trg_sem_error = 0.5 * sum_along_column(trg_ylapla**2)[0]

        bad_trg_ylapla = trg_yla - bad_src_root.p
        bad_trg_sem_error = 0.5 * sum_along_column(bad_trg_ylapla**2)[0]

        trg_sem_margin = trg_sem_error
        trg_sem_margin = max(0.0, trg_sem_margin)
        toptimal = (trg_sem_margin == 0.0)
        trg_total_sem_error += trg_sem_margin

        # Backward pass: accumulate gradients. Only the source side is
        # trained in this variant; the contrastive slot is passed as 0 and
        # the target-side call is left disabled.
        src_rae_la.backward_la(src_root_node, bad_src_root, src_gradients_la,
                               rec_s, sem_s, src_yla_unnormalized, src_ylapla,
                               0, soptimal)
        # trg_rae_la.backward_la(trg_root_node, bad_trg_root, trg_gradients_la,
        #                        rec_t, sem_t, trg_yla_unnormalized, trg_ylapla,
        #                        bad_trg_ylapla, toptimal)

    src_total_rec_error = src_total_rec_error / src_total_internal_node
    trg_total_rec_error = trg_total_rec_error / trg_total_internal_node
    src_total_sem_error = src_total_sem_error / src_total_internal_node
    trg_total_sem_error = trg_total_sem_error / trg_total_internal_node

    return src_total_rec_error, src_total_sem_error, src_gradients_la.to_row_vector_la(), \
           trg_total_rec_error, trg_total_sem_error, trg_gradients_la.to_row_vector_la()
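# --- A hedged sketch of how a caller might fold the six outputs of this
# process_la variant into one objective value and one gradient vector for a
# batch optimizer; `combined_cost` is illustrative, not part of the source.
from numpy import concatenate

def combined_cost(outputs):
    (src_rec, src_sem, src_grad,
     trg_rec, trg_sem, trg_grad) = outputs
    # The alpha / (1 - alpha) weighting is already baked into the errors
    # and gradients via rec_s/rec_t and sem_s/sem_t above.
    cost = src_rec + src_sem + trg_rec + trg_sem
    grad = concatenate([src_grad, trg_grad])
    return cost, grad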
def encode(self, words_embedded):
    '''Forward pass of training recursive autoencoders using
    backpropagation through structures.

    Args:
        words_embedded: word embedding vectors (column vectors)

    Returns:
        value1: list of all 2n-1 tree nodes; the last element is the root
        value2: reconstruction_error
    '''
    words_num = words_embedded.shape[1]
    # A binary tree over n leaves has n + (n - 1) = 2n - 1 nodes in total.
    tree_nodes = [None] * (2*words_num - 1)
    tree_nodes[0:words_num] = [LeafNode(i, words_embedded[:, (i,)])
                               for i in range(words_num)]

    reconstruction_error = 0

    # Build the tree greedily: first compute the reconstruction error of
    # every adjacent pair of leaves in one vectorized pass.
    c1 = words_embedded[:, arange(words_num-1)]
    c2 = words_embedded[:, arange(1, words_num)]

    p_unnormalized = self.f(dot(self.Wi1, c1) + dot(self.Wi2, c2)
                            + self.bi[:, zeros(words_num-1, dtype=int)])
    p = p_unnormalized / LA.norm(p_unnormalized, axis=0)

    y1_unnormalized = self.f(dot(self.Wo1, p)
                             + self.bo1[:, zeros(words_num-1, dtype=int)])
    y1 = y1_unnormalized / LA.norm(y1_unnormalized, axis=0)

    y2_unnormalized = self.f(dot(self.Wo2, p)
                             + self.bo2[:, zeros(words_num-1, dtype=int)])
    y2 = y2_unnormalized / LA.norm(y2_unnormalized, axis=0)

    y1c1 = y1 - c1
    y2c2 = y2 - c2

    # 0.5 as a float literal, so Python 2 integer division cannot zero it out.
    J = 0.5 * (sum_along_column(y1c1**2) + sum_along_column(y2c2**2))

    # Initialize the candidate internal nodes, one per adjacent pair.
    candidate_nodes = []
    for i in range(words_num-1):
        left_child = tree_nodes[i]
        right_child = tree_nodes[i+1]
        node = InternalNode(-i-1, left_child, right_child,
                            p[:, (i,)], p_unnormalized[:, (i,)],
                            y1c1[:, (i,)], y2c2[:, (i,)],
                            y1_unnormalized[:, (i,)], y2_unnormalized[:, (i,)])
        candidate_nodes.append(node)

    debugging_cand_node_index = words_num
    for j in range(words_num-1):
        # Pick the candidate with the smallest reconstruction error.
        J_minpos = J.argmin()
        J_min = J[J_minpos]
        reconstruction_error += J_min
        node = candidate_nodes[J_minpos]
        node.index = words_num + j  # for debugging
        tree_nodes[words_num+j] = node

        # The merge changes the two neighboring candidates; rebuild them.
        if J_minpos+1 < len(candidate_nodes):
            c1 = node
            c2 = candidate_nodes[J_minpos+1].right_child
            right_cand_node, right_J = self.__build_internal_node(c1, c2)
            right_cand_node.index = -debugging_cand_node_index
            debugging_cand_node_index += 1
            candidate_nodes[J_minpos+1] = right_cand_node
            J[J_minpos+1] = right_J

        if J_minpos-1 >= 0:
            c1 = candidate_nodes[J_minpos-1].left_child
            c2 = node
            left_cand_node, left_J = self.__build_internal_node(c1, c2)
            left_cand_node.index = -debugging_cand_node_index
            debugging_cand_node_index += 1
            candidate_nodes[J_minpos-1] = left_cand_node
            J[J_minpos-1] = left_J

        # Drop the merged candidate.
        valid_indices = [i for i in range(words_num-1-j) if i != J_minpos]
        J = J[valid_indices]
        candidate_nodes = [candidate_nodes[k] for k in valid_indices]

    return tree_nodes, reconstruction_error
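# --- A hedged usage sketch for encode; `rae` stands in for a trained
# autoencoder object exposing the method above, and the embedding size is
# an arbitrary example value.
import numpy as np

embsize, phrase_len = 50, 4
words_embedded = np.random.randn(embsize, phrase_len)
# This codebase keeps column vectors unit-normalized.
words_embedded /= np.linalg.norm(words_embedded, axis=0)

tree_nodes, rec_error = rae.encode(words_embedded)
root = tree_nodes[-1]   # the last node is the root of the greedy tree
phrase_vector = root.p  # the composed phrase representation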
def process_la(source_rae_la, target_rae_la, alpha,
               source_word_vectors, source_instances, source_total_internal_node,
               target_word_vectors, target_instances, target_total_internal_node,
               bad_src_instances, bad_trg_instances):
    # Initialize gradient accumulators.
    source_gradients_la = source_rae_la.get_zero_gradients_la()
    target_gradients_la = target_rae_la.get_zero_gradients_la()

    source_total_rec_error = 0
    target_total_rec_error = 0
    source_total_sem_error = 0
    target_total_sem_error = 0

    for i in xrange(len(source_instances)):
        source_instance = source_instances[i]
        target_instance = target_instances[i]
        bad_src_instance = bad_src_instances[i]
        bad_trg_instance = bad_trg_instances[i]

        # Look up the embeddings of every word in the phrase: the word ids
        # in instance.words become an n x word_num matrix of column vectors.
        source_words_embedded = source_word_vectors[source_instance.words]
        target_words_embedded = target_word_vectors[target_instance.words]
        bad_source_embedded = source_word_vectors[bad_src_instance]
        bad_target_embedded = target_word_vectors[bad_trg_instance]

        # Forward pass: compute reconstruction errors.
        source_root_node, source_rec_error = source_rae_la.forward_la(source_words_embedded)
        target_root_node, target_rec_error = target_rae_la.forward_la(target_words_embedded)
        source_total_rec_error += source_rec_error * source_instance.freq
        target_total_rec_error += target_rec_error * target_instance.freq

        bad_source_root, _ = source_rae_la.forward_la(bad_source_embedded)
        bad_target_root, _ = target_rae_la.forward_la(bad_target_embedded)

        rec_s = alpha * source_instance.freq / source_total_internal_node
        rec_t = alpha * target_instance.freq / target_total_internal_node
        sem_s = (1 - alpha) * source_instance.freq / source_total_internal_node
        sem_t = (1 - alpha) * target_instance.freq / target_total_internal_node

        # Semantic error, source side: max-margin between the aligned
        # target root and the sampled bad target root.
        source_yla_unnormalized = tanh(dot(source_rae_la.Wla, source_root_node.p)
                                       + source_rae_la.bla)
        source_yla = source_yla_unnormalized / LA.norm(source_yla_unnormalized, axis=0)
        source_ylapla = source_yla - target_root_node.p
        source_sem_error = 0.5 * sum_along_column(source_ylapla**2)[0]

        bad_source_ylapla = source_yla - bad_target_root.p
        bad_source_sem_error = 0.5 * sum_along_column(bad_source_ylapla**2)[0]

        source_sem_margin = (source_sem_error - bad_source_sem_error + 1) * source_instance.freq
        source_sem_margin = max(0.0, source_sem_margin)
        soptimal = (source_sem_margin == 0.0)
        source_total_sem_error += source_sem_margin

        # Target side.
        target_yla_unnormalized = tanh(dot(target_rae_la.Wla, target_root_node.p)
                                       + target_rae_la.bla)
        target_yla = target_yla_unnormalized / LA.norm(target_yla_unnormalized, axis=0)
        target_ylapla = target_yla - source_root_node.p
        target_sem_error = 0.5 * sum_along_column(target_ylapla**2)[0]

        bad_target_ylapla = target_yla - bad_source_root.p
        bad_target_sem_error = 0.5 * sum_along_column(bad_target_ylapla**2)[0]

        target_sem_margin = (target_sem_error - bad_target_sem_error + 1) * target_instance.freq
        target_sem_margin = max(0.0, target_sem_margin)
        toptimal = (target_sem_margin == 0.0)
        target_total_sem_error += target_sem_margin

        # Backward pass: accumulate gradients on both sides.
        source_rae_la.backward_la(source_root_node, bad_source_root, source_gradients_la,
                                  rec_s, sem_s, sem_t,
                                  source_yla_unnormalized, source_ylapla, target_ylapla,
                                  bad_source_ylapla, bad_target_ylapla,
                                  soptimal, toptimal)
        target_rae_la.backward_la(target_root_node, bad_target_root, target_gradients_la,
                                  rec_t, sem_t, sem_s,
                                  target_yla_unnormalized, target_ylapla, source_ylapla,
                                  bad_target_ylapla, bad_source_ylapla,
                                  toptimal, soptimal)

    total_rec_error = (source_total_rec_error * (1.0 / source_total_internal_node)
                       + target_total_rec_error * (1.0 / target_total_internal_node))
    total_sem_error = (source_total_sem_error * (1.0 / source_total_internal_node)
                       + target_total_sem_error * (1.0 / target_total_internal_node))

    grad_row_vec = [source_gradients_la.to_row_vector_la(),
                    target_gradients_la.to_row_vector_la()]
    return total_rec_error, total_sem_error, concatenate(grad_row_vec)
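# --- The margin computed above is a standard contrastive hinge. With
# E_sem(s, t) = 0.5 * ||y_la(s) - p_t||^2 as in the code, in LaTeX:
#
#   E_{margin}(s) = \max\bigl(0,\; E_{sem}(s, t) - E_{sem}(s, t^{-}) + 1\bigr)
#                   \cdot \mathrm{freq}(s)
#
# where t is the aligned target phrase and t^{-} the sampled bad one; the
# soptimal/toptimal flags record that a zero margin contributes no gradient.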
def forward(self, words_embedded):
    '''Forward pass of training recursive autoencoders using
    backpropagation through structures.

    Args:
        words_embedded: word embedding vectors (column vectors)

    Returns:
        value1: list of all tree nodes; the last element is the root
        value2: reconstruction_error
    '''
    sent_length = words_embedded.shape[1]
    tree_node_indices = arange(sent_length)
    tree_nodes = [None] * (2*sent_length - 1)
    tree_nodes[0:sent_length] = [LeafNode(i, words_embedded[:, (i,)])
                                 for i in range(sent_length)]

    reconstruction_error = 0

    # Build the tree greedily: at each step recompute the error of every
    # adjacent pair in the (shrinking) sequence and merge the best one.
    for j in range(sent_length-1):
        words_num = words_embedded.shape[1]

        c1 = words_embedded[:, arange(words_num-1)]
        c2 = words_embedded[:, arange(1, words_num)]

        p_unnormalized = self.f(dot(self.Wi1, c1) + dot(self.Wi2, c2)
                                + self.bi[:, zeros(words_num-1, dtype=int)])
        p = p_unnormalized / LA.norm(p_unnormalized, axis=0)

        y1_unnormalized = self.f(dot(self.Wo1, p)
                                 + self.bo1[:, zeros(words_num-1, dtype=int)])
        y1 = y1_unnormalized / LA.norm(y1_unnormalized, axis=0)

        y2_unnormalized = self.f(dot(self.Wo2, p)
                                 + self.bo2[:, zeros(words_num-1, dtype=int)])
        y2 = y2_unnormalized / LA.norm(y2_unnormalized, axis=0)

        y1c1 = y1 - c1
        y2c2 = y2 - c2

        J = 0.5 * (sum_along_column(y1c1**2) + sum_along_column(y2c2**2))

        # Find the pair with the smallest reconstruction error.
        J_minpos = J.argmin()
        J_min = J[J_minpos]
        reconstruction_error += J_min

        left_child = tree_nodes[tree_node_indices[J_minpos]]
        right_child = tree_nodes[tree_node_indices[J_minpos+1]]
        node = InternalNode(sent_length+j, left_child, right_child,
                            p[:, (J_minpos,)], p_unnormalized[:, (J_minpos,)],
                            y1c1[:, (J_minpos,)], y2c2[:, (J_minpos,)],
                            y1_unnormalized[:, (J_minpos,)],
                            y2_unnormalized[:, (J_minpos,)])
        tree_nodes[sent_length+j] = node

        # Replace the merged pair with the new parent vector.
        valid_indices = [i for i in range(sent_length-j) if i != J_minpos+1]
        words_embedded = words_embedded[:, valid_indices]
        words_embedded[:, (J_minpos,)] = p[:, (J_minpos,)]
        tree_node_indices = tree_node_indices[valid_indices]
        tree_node_indices[J_minpos] = sent_length + j

    return tree_nodes, reconstruction_error
def forward_la(self, words_embedded, instance):
    '''Forward pass of training recursive autoencoders using
    backpropagation through structures.

    Args:
        words_embedded: word embedding vectors (column vectors)
        instance: the phrase instance whose word ids label the leaves

    Returns:
        value1: root of the tree, an instance of InternalNode
        value2: reconstruction_error
    '''
    # Number of words in the phrase.
    words_num = words_embedded.shape[1]
    # A binary tree over n leaves has n + (n - 1) = 2n - 1 nodes in total.
    tree_nodes = [None] * (2*words_num - 1)
    # The first n nodes are the leaf nodes of the input words.
    tree_nodes[0:words_num] = [LeafNode(instance.words[i], words_embedded[:, (i,)])
                               for i in range(words_num)]

    reconstruction_error = 0

    # Build the tree greedily. Initialize reconstruction errors: c1 holds
    # the first n-1 word vectors and c2 the last n-1, so each column pair
    # (c1[:, i], c2[:, i]) is an adjacent pair of leaves; the pair with the
    # smallest error is merged first.
    c1 = words_embedded[:, arange(words_num-1)]
    c2 = words_embedded[:, arange(1, words_num)]

    p_unnormalized = self.f(dot(self.Wi1, c1) + dot(self.Wi2, c2)
                            + self.bi[:, zeros(words_num-1, dtype=int)])
    p = p_unnormalized / LA.norm(p_unnormalized, axis=0)

    y1_unnormalized = self.f(dot(self.Wo1, p)
                             + self.bo1[:, zeros(words_num-1, dtype=int)])
    y1 = y1_unnormalized / LA.norm(y1_unnormalized, axis=0)

    y2_unnormalized = self.f(dot(self.Wo2, p)
                             + self.bo2[:, zeros(words_num-1, dtype=int)])
    y2 = y2_unnormalized / LA.norm(y2_unnormalized, axis=0)

    y1c1 = y1 - c1
    y2c2 = y2 - c2

    J = 0.5 * (sum_along_column(y1c1**2) + sum_along_column(y2c2**2))

    # Initialize the n-1 candidate internal nodes, one per adjacent leaf
    # pair, indexed -1 .. -(n-1).
    candidate_nodes = []
    for i in range(words_num-1):
        left_child = tree_nodes[i]
        right_child = tree_nodes[i+1]
        node = InternalNode(-i-1, left_child, right_child,
                            p[:, (i,)], p_unnormalized[:, (i,)],
                            y1c1[:, (i,)], y2c2[:, (i,)],
                            y1_unnormalized[:, (i,)], y2_unnormalized[:, (i,)])
        candidate_nodes.append(node)

    debugging_cand_node_index = words_num
    # Pick the best candidate n-1 times.
    for j in range(words_num-1):
        # Index and value of the candidate with the smallest error.
        J_minpos = J.argmin()
        J_min = J[J_minpos]
        # Accumulate the total error and adopt the winning candidate.
        reconstruction_error += J_min
        node = candidate_nodes[J_minpos]
        node.index = words_num + j  # for debugging
        tree_nodes[words_num+j] = node

        # Update reconstruction errors: if the winner has a right neighbor,
        # rebuild that candidate with the new parent as its left child.
        if J_minpos+1 < len(candidate_nodes):
            c1 = node
            c2 = candidate_nodes[J_minpos+1].right_child
            right_cand_node, right_J = self.__build_internal_node(c1, c2)
            # Candidate indices count down from -n.
            right_cand_node.index = -debugging_cand_node_index
            debugging_cand_node_index += 1
            candidate_nodes[J_minpos+1] = right_cand_node
            J[J_minpos+1] = right_J

        # Likewise if the winner is not the leftmost candidate.
        if J_minpos-1 >= 0:
            c1 = candidate_nodes[J_minpos-1].left_child
            c2 = node
            left_cand_node, left_J = self.__build_internal_node(c1, c2)
            left_cand_node.index = -debugging_cand_node_index
            debugging_cand_node_index += 1
            candidate_nodes[J_minpos-1] = left_cand_node
            J[J_minpos-1] = left_J

        valid_indices = [i for i in range(words_num-1-j) if i != J_minpos]
        J = J[valid_indices]
        candidate_nodes = [candidate_nodes[k] for k in valid_indices]

    return tree_nodes[-1], reconstruction_error
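# --- A hedged usage sketch for forward_la; `Instance` is a stand-in for
# the project's phrase-instance type, inferred from the fields this code
# reads (words, freq, idx), and `rae` is an assumed trained model.
import numpy as np
from collections import namedtuple

Instance = namedtuple('Instance', ['words', 'freq', 'idx'])

instance = Instance(words=[12, 7, 93], freq=2, idx=['0'])
words_embedded = np.random.randn(50, len(instance.words))
words_embedded /= np.linalg.norm(words_embedded, axis=0)

root, rec_error = rae.forward_la(words_embedded, instance)
phrase_vector = root.p  # root representation of the phrase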
def forward(self, words_embedded):
    '''Forward pass of training recursive autoencoders using
    backpropagation through structures.

    Args:
        words_embedded: word embedding vectors (column vectors)

    Returns:
        value1: root of the tree, an instance of InternalNode
        value2: reconstruction_error
    '''
    words_num = words_embedded.shape[1]
    tree_nodes = [None] * (2*words_num - 1)
    tree_nodes[0:words_num] = [LeafNode(i, words_embedded[:, (i,)])
                               for i in range(words_num)]

    reconstruction_error = 0

    # Build the tree greedily.
    # Initialize reconstruction errors for all adjacent pairs.
    c1 = words_embedded[:, arange(words_num-1)]
    c2 = words_embedded[:, arange(1, words_num)]

    p_unnormalized = self.f(dot(self.Wi1, c1) + dot(self.Wi2, c2)
                            + self.bi[:, zeros(words_num-1, dtype=int)])
    p = p_unnormalized / LA.norm(p_unnormalized, axis=0)

    y1_unnormalized = self.f(dot(self.Wo1, p)
                             + self.bo1[:, zeros(words_num-1, dtype=int)])
    y1 = y1_unnormalized / LA.norm(y1_unnormalized, axis=0)

    y2_unnormalized = self.f(dot(self.Wo2, p)
                             + self.bo2[:, zeros(words_num-1, dtype=int)])
    y2 = y2_unnormalized / LA.norm(y2_unnormalized, axis=0)

    y1c1 = y1 - c1
    y2c2 = y2 - c2

    J = 0.5 * (sum_along_column(y1c1**2) + sum_along_column(y2c2**2))

    # Initialize candidate internal nodes, one per adjacent pair.
    candidate_nodes = []
    for i in range(words_num-1):
        left_child = tree_nodes[i]
        right_child = tree_nodes[i+1]
        node = InternalNode(-i-1, left_child, right_child,
                            p[:, (i,)], p_unnormalized[:, (i,)],
                            y1c1[:, (i,)], y2c2[:, (i,)],
                            y1_unnormalized[:, (i,)], y2_unnormalized[:, (i,)])
        candidate_nodes.append(node)

    debugging_cand_node_index = words_num
    # Greedy merging: e.g. if the pair (1, 2) has a smaller error than
    # (3, 4), merge it first; after each merge, update the errors of the
    # affected neighbors and repeat.
    for j in range(words_num-1):
        # Find the smallest reconstruction error.
        J_minpos = J.argmin()
        J_min = J[J_minpos]
        reconstruction_error += J_min
        node = candidate_nodes[J_minpos]
        node.index = words_num + j  # for debugging
        tree_nodes[words_num+j] = node

        # Update the reconstruction errors of the winner's neighbors.
        if J_minpos+1 < len(candidate_nodes):
            c1 = node
            c2 = candidate_nodes[J_minpos+1].right_child
            right_cand_node, right_J = self.__build_internal_node(c1, c2)
            right_cand_node.index = -debugging_cand_node_index
            debugging_cand_node_index += 1
            candidate_nodes[J_minpos+1] = right_cand_node
            J[J_minpos+1] = right_J

        if J_minpos-1 >= 0:
            c1 = candidate_nodes[J_minpos-1].left_child
            c2 = node
            left_cand_node, left_J = self.__build_internal_node(c1, c2)
            left_cand_node.index = -debugging_cand_node_index
            debugging_cand_node_index += 1
            candidate_nodes[J_minpos-1] = left_cand_node
            J[J_minpos-1] = left_J

        valid_indices = [i for i in range(words_num-1-j) if i != J_minpos]
        J = J[valid_indices]
        candidate_nodes = [candidate_nodes[k] for k in valid_indices]

    return tree_nodes[-1], reconstruction_error
def test_sum_along_column(self):
    x = np.array([[1, 2, 3],
                  [4, 5, 6]])
    x_sum = sum_along_column(x)
    x_sum_expected = np.array([5, 7, 9])
    self.assertTrue((x_sum == x_sum_expected).all())
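# --- A self-contained, hedged version of the same check, runnable without
# the surrounding TestCase (whose class name is not shown in the source);
# the sum_along_column implementation here is the assumed one from above.
import unittest
import numpy as np

def sum_along_column(x):
    return np.sum(x, axis=0)

class SumAlongColumnTest(unittest.TestCase):
    def test_sum_along_column(self):
        x = np.array([[1, 2, 3], [4, 5, 6]])
        np.testing.assert_array_equal(sum_along_column(x),
                                      np.array([5, 7, 9]))

if __name__ == '__main__':
    unittest.main()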
def forward(self, words_embedded):
    '''Forward pass of training recursive autoencoders using
    backpropagation through structures.

    Args:
        words_embedded: word embedding vectors (column vectors)

    Returns:
        value1: root of the tree, an instance of InternalNode
        value2: reconstruction_error
    '''
    sent_length = words_embedded.shape[1]
    tree_node_indices = arange(sent_length)
    tree_nodes = [None] * (2*sent_length - 1)
    tree_nodes[0:sent_length] = [LeafNode(i, words_embedded[:, (i,)])
                                 for i in range(sent_length)]

    reconstruction_error = 0

    # Build the tree greedily, recomputing the error of every adjacent
    # pair of the (shrinking) sequence at each step.
    for j in range(sent_length-1):
        words_num = words_embedded.shape[1]

        c1 = words_embedded[:, arange(words_num-1)]
        c2 = words_embedded[:, arange(1, words_num)]

        p_unnormalized = self.f(dot(self.Wi1, c1) + dot(self.Wi2, c2)
                                + self.bi[:, zeros(words_num-1, dtype=int)])
        p = p_unnormalized / LA.norm(p_unnormalized, axis=0)

        y1_unnormalized = self.f(dot(self.Wo1, p)
                                 + self.bo1[:, zeros(words_num-1, dtype=int)])
        y1 = y1_unnormalized / LA.norm(y1_unnormalized, axis=0)

        y2_unnormalized = self.f(dot(self.Wo2, p)
                                 + self.bo2[:, zeros(words_num-1, dtype=int)])
        y2 = y2_unnormalized / LA.norm(y2_unnormalized, axis=0)

        y1c1 = y1 - c1
        y2c2 = y2 - c2

        J = 0.5 * (sum_along_column(y1c1**2) + sum_along_column(y2c2**2))

        # Find the pair with the smallest reconstruction error.
        J_minpos = J.argmin()
        J_min = J[J_minpos]
        reconstruction_error += J_min

        left_child = tree_nodes[tree_node_indices[J_minpos]]
        right_child = tree_nodes[tree_node_indices[J_minpos+1]]
        node = InternalNode(sent_length+j, left_child, right_child,
                            p[:, (J_minpos,)], p_unnormalized[:, (J_minpos,)],
                            y1c1[:, (J_minpos,)], y2c2[:, (J_minpos,)],
                            y1_unnormalized[:, (J_minpos,)],
                            y2_unnormalized[:, (J_minpos,)])
        tree_nodes[sent_length+j] = node

        # Replace the merged pair with the new parent vector.
        valid_indices = [i for i in range(sent_length-j) if i != J_minpos+1]
        words_embedded = words_embedded[:, valid_indices]
        words_embedded[:, (J_minpos,)] = p[:, (J_minpos,)]
        tree_node_indices = tree_node_indices[valid_indices]
        tree_node_indices[J_minpos] = sent_length + j

    return tree_nodes[-1], reconstruction_error
def process_la(src_rae_la, trg_rae_la, alpha,
               src_word_vectors, src_instances, src_total_internal_node,
               trg_word_vectors, trg_instances, trg_total_internal_node,
               bad_src_instances, bad_trg_instances,
               src_Xidx, trg_Xidx, src_hiero_map, trg_hiero_map):
    # Initialize gradient accumulators.
    src_gradients_la = src_rae_la.get_zero_gradients_la()
    trg_gradients_la = trg_rae_la.get_zero_gradients_la()

    src_total_rec_error = 0
    trg_total_rec_error = 0
    src_total_sem_error = 0
    trg_total_sem_error = 0

    for i in xrange(len(src_instances)):
        src_instance = src_instances[i]
        trg_instance = trg_instances[i]
        bad_src_instance = bad_src_instances[i]
        bad_trg_instance = bad_trg_instances[i]

        if src_Xidx[0] in src_instance.words:
            src_words_embedded = src_word_vectors[src_instance.words]
            trg_words_embedded = trg_word_vectors[trg_instance.words]
            if src_Xidx[1] in src_instance.words:
                # The rule contains both non-terminals X1 and X2: replace
                # their placeholder embeddings with vectors averaged from
                # the cached sub-phrase roots in the hiero maps.
                src_x1 = src_instance.words.index(src_Xidx[0])
                src_x2 = src_instance.words.index(src_Xidx[1])
                trg_x1 = trg_instance.words.index(trg_Xidx[0])
                trg_x2 = trg_instance.words.index(trg_Xidx[1])
                src_words_embedded[:, src_x1] = zeros_like(src_words_embedded[:, src_x1])
                src_words_embedded[:, src_x2] = zeros_like(src_words_embedded[:, src_x2])
                trg_words_embedded[:, trg_x1] = zeros_like(trg_words_embedded[:, trg_x1])
                trg_words_embedded[:, trg_x2] = zeros_like(trg_words_embedded[:, trg_x2])

                # Inner loops use k so the outer instance index i is not shadowed.
                for k in xrange(len(src_instance.idx)):
                    src_idx = src_instance.idx[k].strip().split(',')
                    if src_idx[0] in src_hiero_map:
                        src_words_embedded[:, src_x1] += src_hiero_map[src_idx[0]]
                    else:
                        src_words_embedded[:, src_x1] = src_words_embedded[:, src_x1] / (k + 1)
                    if src_idx[1] in src_hiero_map:
                        src_words_embedded[:, src_x2] += src_hiero_map[src_idx[1]]
                    else:
                        src_words_embedded[:, src_x2] = src_words_embedded[:, src_x2] / (k + 1)
                src_words_embedded[:, src_x1] /= src_instance.freq
                src_words_embedded[:, src_x2] /= src_instance.freq
                src_root_node, src_rec_error = src_rae_la.forward_la(src_words_embedded)

                for k in xrange(len(trg_instance.idx)):
                    trg_idx = trg_instance.idx[k].strip().split(',')
                    if trg_idx[0] in trg_hiero_map:
                        trg_words_embedded[:, trg_x1] += trg_hiero_map[trg_idx[0]]
                    else:
                        trg_words_embedded[:, trg_x1] = trg_words_embedded[:, trg_x1] / (k + 1)
                    if trg_idx[1] in trg_hiero_map:
                        trg_words_embedded[:, trg_x2] += trg_hiero_map[trg_idx[1]]
                    else:
                        trg_words_embedded[:, trg_x2] = trg_words_embedded[:, trg_x2] / (k + 1)
                trg_words_embedded[:, trg_x1] /= trg_instance.freq
                trg_words_embedded[:, trg_x2] /= trg_instance.freq
                trg_root_node, trg_rec_error = trg_rae_la.forward_la(trg_words_embedded)
            else:
                # The rule contains only X1.
                src_x1 = src_instance.words.index(src_Xidx[0])
                trg_x1 = trg_instance.words.index(trg_Xidx[0])
                src_words_embedded[:, src_x1] = zeros_like(src_words_embedded[:, src_x1])
                trg_words_embedded[:, trg_x1] = zeros_like(trg_words_embedded[:, trg_x1])

                for k in xrange(len(src_instance.idx)):
                    src_idx = src_instance.idx[k]
                    if src_idx in src_hiero_map:
                        src_words_embedded[:, src_x1] += src_hiero_map[src_idx]
                    else:
                        src_words_embedded[:, src_x1] += src_words_embedded[:, src_x1] / (k + 1)
                src_words_embedded[:, src_x1] /= src_instance.freq
                src_root_node, src_rec_error = src_rae_la.forward_la(src_words_embedded)

                for k in xrange(len(trg_instance.idx)):
                    trg_idx = trg_instance.idx[k]
                    if trg_idx in trg_hiero_map:
                        trg_words_embedded[:, trg_x1] += trg_hiero_map[trg_idx]
                    else:
                        trg_words_embedded[:, trg_x1] += trg_words_embedded[:, trg_x1] / (k + 1)
                trg_words_embedded[:, trg_x1] /= trg_instance.freq
                trg_root_node, trg_rec_error = trg_rae_la.forward_la(trg_words_embedded)
        else:
            # Plain phrase: look up the embeddings of every word; the word
            # ids in instance.words become an n x word_num matrix.
            src_words_embedded = src_word_vectors[src_instance.words]
            trg_words_embedded = trg_word_vectors[trg_instance.words]
            src_root_node, src_rec_error = src_rae_la.forward_la(src_words_embedded)
            trg_root_node, trg_rec_error = trg_rae_la.forward_la(trg_words_embedded)
            # Cache the root vectors so later hiero rules can reuse them.
            src_hiero_map[src_instance.idx[0]] = src_root_node.p.reshape(src_word_vectors.embsize(),)
            trg_hiero_map[trg_instance.idx[0]] = trg_root_node.p.reshape(trg_word_vectors.embsize(),)

        # Embeddings of the sampled negative (bad) phrases.
        bad_src_embedded = src_word_vectors[bad_src_instance]
        bad_trg_embedded = trg_word_vectors[bad_trg_instance]

        # Accumulate reconstruction errors.
        src_total_rec_error += src_rec_error * src_instance.freq
        trg_total_rec_error += trg_rec_error * trg_instance.freq

        bad_src_root, _ = src_rae_la.forward_la(bad_src_embedded)
        bad_trg_root, _ = trg_rae_la.forward_la(bad_trg_embedded)

        rec_s = alpha * src_instance.freq / src_total_internal_node
        rec_t = alpha * trg_instance.freq / trg_total_internal_node
        sem_s = (1 - alpha) * src_instance.freq / src_total_internal_node
        sem_t = (1 - alpha) * trg_instance.freq / trg_total_internal_node

        # Semantic error, source side.
        src_yla_unnormalized = tanh(dot(src_rae_la.Wla, src_root_node.p) + src_rae_la.bla)
        src_yla = src_yla_unnormalized / LA.norm(src_yla_unnormalized, axis=0)
        src_ylapla = src_yla - trg_root_node.p
        src_sem_error = 0.5 * sum_along_column(src_ylapla**2)[0]

        bad_src_ylapla = src_yla - bad_trg_root.p
        bad_src_sem_error = 0.5 * sum_along_column(bad_src_ylapla**2)[0]

        src_sem_margin = (src_sem_error - bad_src_sem_error + 1) * src_instance.freq
        src_sem_margin = max(0.0, src_sem_margin)
        soptimal = (src_sem_margin == 0.0)
        src_total_sem_error += src_sem_margin

        # Target side.
        trg_yla_unnormalized = tanh(dot(trg_rae_la.Wla, trg_root_node.p) + trg_rae_la.bla)
        trg_yla = trg_yla_unnormalized / LA.norm(trg_yla_unnormalized, axis=0)
        trg_ylapla = trg_yla - src_root_node.p
        trg_sem_error = 0.5 * sum_along_column(trg_ylapla**2)[0]

        bad_trg_ylapla = trg_yla - bad_src_root.p
        bad_trg_sem_error = 0.5 * sum_along_column(bad_trg_ylapla**2)[0]

        trg_sem_margin = (trg_sem_error - bad_trg_sem_error + 1) * trg_instance.freq
        trg_sem_margin = max(0.0, trg_sem_margin)
        toptimal = (trg_sem_margin == 0.0)
        trg_total_sem_error += trg_sem_margin

        # Backward pass: accumulate gradients on both sides.
        src_rae_la.backward_la(src_root_node, bad_src_root, src_gradients_la,
                               rec_s, sem_s, src_yla_unnormalized, src_ylapla,
                               bad_src_ylapla, soptimal)
        trg_rae_la.backward_la(trg_root_node, bad_trg_root, trg_gradients_la,
                               rec_t, sem_t, trg_yla_unnormalized, trg_ylapla,
                               bad_trg_ylapla, toptimal)

    src_total_rec_error = src_total_rec_error * (1.0 / src_total_internal_node)
    trg_total_rec_error = trg_total_rec_error * (1.0 / trg_total_internal_node)
    src_total_sem_error = src_total_sem_error * (1.0 / src_total_internal_node)
    trg_total_sem_error = trg_total_sem_error * (1.0 / trg_total_internal_node)

    return src_total_rec_error, src_total_sem_error, src_gradients_la.to_row_vector_la(), \
           trg_total_rec_error, trg_total_sem_error, trg_gradients_la.to_row_vector_la()
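# --- Across the three process_la variants, the per-instance weights
# rec_* = alpha * freq / N and sem_* = (1 - alpha) * freq / N implement the
# same weighted objective on each side; in LaTeX:
#
#   J = \sum_{\text{instances}} \frac{freq}{N}
#       \bigl( \alpha\, E_{rec} + (1 - \alpha)\, E_{sem} \bigr)
#
# with N the side's total number of internal nodes, so the returned errors
# are per-internal-node averages.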