class GraphConvolution(Module):
    """Simple GCN layer, similar to https://arxiv.org/abs/1609.02907.

    Implementation taken from https://github.com/tkipf/pygcn.

    The layer computes ``adj @ (input1 @ weight) + bias``.

    Args:
        in_features: size of each input node feature vector.
        out_features: size of each output node feature vector.
        bias: when True (default) a learnable bias of size ``out_features``
            is added to the output; otherwise ``self.bias`` is ``None``.
    """

    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw weight and bias uniformly from ``[-stdv, stdv]``.

        ``stdv`` is ``1 / sqrt(out_features)`` (the second weight dimension).
        """
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input1, adj):
        """Apply the graph convolution.

        Args:
            input1: node feature matrix multiplied with ``self.weight`` via
                ``torch.mm``.
            adj: adjacency matrix multiplied with the projected features via
                ``torch.spmm`` (presumably sparse -- confirm against caller).

        Returns:
            The convolved features, on the device the layer's weight lives on.
        """
        # Follow the device of the registered parameters instead of forcing
        # CUDA.  The weight is NOT re-wrapped in a new Parameter here: doing so
        # on every call would replace the object an optimizer already tracks,
        # so gradients would never reach the optimizer's copy.
        device = self.weight.device
        input1 = input1.to(device)
        adj = adj.to(device)

        support = torch.mm(input1, self.weight)
        output = torch.spmm(adj, support)
        if self.bias is not None:
            return output + self.bias
        return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
            + str(self.in_features) + ' -> ' \
            + str(self.out_features) + ')'
def _sin_cos_enc(self, from_length, to_length, embedding_size):
    """Build a sin/cos position table for positions in [from_length, to_length).

    Entry ``(pos, i)`` starts as ``pos / 10000 ** (2 * i / embedding_size)``;
    even columns are then passed through ``sin`` and odd columns through
    ``cos``.

    Returns:
        A float32 ``Parameter`` with one row per position and
        ``embedding_size`` columns.  If the first parameter of ``self.embed``
        is on CUDA, the result of calling ``.cuda()`` on it is returned.
    """
    # One divisor per embedding column, shared by every position.
    divisors = [
        np.power(10000, 2 * column / embedding_size)
        for column in range(embedding_size)
    ]
    rows = []
    for position in range(from_length, to_length):
        rows.append([position / divisor for divisor in divisors])
    table = np.array(rows, dtype=np.float32)

    # Sine on the even columns, cosine on the odd columns.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # NOTE(review): the original author also questioned why this is a Parameter.
    encoding = Parameter(torch.from_numpy(table))
    embed_on_gpu = next(self.embed.parameters()).is_cuda
    if embed_on_gpu:
        encoding = encoding.cuda()
    return encoding
class Gumbel_Generator_nc_asy(nn.Module):
    """Gumbel-softmax generator for the unknown part of an ``sz x sz`` matrix.

    Only the entries outside the top-left ``(sz - del_num) x (sz - del_num)``
    block, excluding the diagonal, are parameterised.  There are
    ``del_num * (2 * sz - del_num - 1)`` such entries and each has two logits.

    Args:
        sz: side length of the generated square matrix.
        del_num: number of trailing rows/columns treated as unknown.
        temp: initial Gumbel-softmax temperature.
        temp_drop_frac: multiplicative factor applied by ``drop_temp``.
    """

    def __init__(self, sz=10, del_num=1, temp=10, temp_drop_frac=0.9999):
        super(Gumbel_Generator_nc_asy, self).__init__()
        self.sz = sz
        self.del_num = del_num
        # Parameters exist only for the unknown entries.
        self.gen_matrix = Parameter(
            torch.rand(del_num * (2 * sz - del_num - 1), 2))
        self.temperature = temp
        self.temp_drop_frac = temp_drop_frac

    def drop_temp(self):
        """Cool down: multiply the temperature by ``temp_drop_frac``."""
        self.temperature = self.temperature * self.temp_drop_frac

    def sample_all(self, hard=False):
        """Sample the unknown entries and scatter them into an ``sz x sz`` matrix.

        Args:
            hard: forwarded to ``gumbel_softmax``; when true the sample is
                converted to a one-hot table before the first column is taken.

        Returns:
            An ``sz x sz`` tensor that is zero in the known block and on the
            diagonal, and holds the sampled values elsewhere.
        """
        # NOTE(review): assigning a Parameter to ``self.logp`` also registers
        # ``logp`` as a module parameter; kept as in the original.
        self.logp = self.gen_matrix
        if use_cuda:
            self.logp = self.gen_matrix.cuda()

        out = gumbel_softmax(self.logp, self.temperature, hard)
        if hard:
            # presumably ``out[i]`` is the chosen class index -- confirm
            # against gumbel_softmax.
            hh = torch.zeros(
                (self.del_num * (2 * self.sz - self.del_num - 1), 2))
            for i in range(out.size()[0]):
                hh[i, out[i]] = 1
            out = hh
        out = out[:, 0]
        if use_cuda:
            out = out.cuda()

        # Allocate the result on the same device as the samples.  The original
        # called ``.cuda()`` unconditionally, which ignored ``use_cuda``.
        matrix = torch.zeros(self.sz, self.sz, device=out.device)

        # Mask of unknown positions: everything outside the known top-left
        # block, with the diagonal removed.
        left_mask = torch.ones(self.sz, self.sz)
        left_mask[:-self.del_num, :-self.del_num] = 0
        left_mask = left_mask - torch.diag(torch.diag(left_mask))
        un_index = left_mask.nonzero()

        matrix[(un_index[:, 0], un_index[:, 1])] = out
        return matrix

    def init(self, mean, var):
        """Re-initialise ``gen_matrix`` from a normal distribution.

        ``var`` is passed as the *standard deviation* to ``init.normal_``.
        """
        init.normal_(self.gen_matrix, mean=mean, std=var)
class Tree(nn.Module):
    """Soft decision tree whose split nodes read randomly chosen input features.

    The tree has ``2 ** (depth - 1)`` leaves and ``n_leaf - 1`` split nodes.
    Each split node applies a sigmoid to one input feature, selected by a
    fixed one-hot ``feature_mask``.

    Args:
        depth: tree depth; there are ``depth - 1`` routing layers.
        n_in_feature: number of input features; must be at least
            ``n_leaf - 1`` because features are drawn without replacement.
    """

    def __init__(self, depth, n_in_feature):
        super(Tree, self).__init__()
        self.depth = depth
        self.n_leaf = 2 ** (depth - 1)

        # One distinct input feature per split node.
        n_used_feature = self.n_leaf - 1
        onehot = np.eye(n_in_feature)
        using_idx = np.random.choice(
            np.arange(n_in_feature), n_used_feature, replace=False)
        mask = onehot[using_idx].T
        self.feature_mask = Parameter(
            torch.from_numpy(mask).type(torch.FloatTensor),
            requires_grad=False)

    def forward(self, x):
        """Compute leaf routing probabilities.

        :param x: input features, [batch_size, n_features]
        :return: route probability, [batch_size, n_leaf]
        """
        # Use a device-matched local copy.  Assigning the tensor returned by
        # ``Parameter.cuda()`` back to ``self.feature_mask`` raises TypeError
        # because the name is a registered parameter.
        feature_mask = self.feature_mask.to(x.device)

        feats = torch.mm(x, feature_mask)        # -> [batch_size, n_leaf - 1]
        decision = torch.sigmoid(feats)
        decision = torch.unsqueeze(decision, dim=2)
        # Last dim holds (go-left, go-right) probabilities.
        decision = torch.cat((decision, 1 - decision), dim=2)

        batch_size = x.size()[0]
        _mu = x.new_ones(batch_size, 1, 1)
        begin_idx = 0
        end_idx = 1
        for n_layer in range(0, self.depth - 1):
            # Duplicate every node probability for its two children.
            _mu = _mu.view(batch_size, -1, 1).repeat(1, 1, 2)
            _decision = decision[:, begin_idx:end_idx, :]  # [batch, 2**n_layer, 2]
            _mu = _mu * _decision
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (n_layer + 1)

        return _mu.view(batch_size, self.n_leaf)
class LinearLayer(nn.Module):
    """Bias-free linear map ``input @ weight``.

    Args:
        in_features: number of input columns.
        out_features: number of output columns.
        initializer: in-place initialiser applied to the freshly allocated
            ``[in_features, out_features]`` weight (Xavier uniform by default).
    """

    def __init__(self, in_features, out_features, initializer=nn.init.xavier_uniform_):
        super(LinearLayer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        raw_weight = torch.Tensor(in_features, out_features)
        self.weight = Parameter(initializer(raw_weight))

    def forward(self, input):
        """Multiply ``input`` by the weight matrix (no bias term).

        When ``config.learning.cuda`` is set, both operands are moved to CUDA
        before the product.
        """
        if not config.learning.cuda:
            return torch.mm(input, self.weight)
        gpu_input = input.cuda()
        gpu_weight = self.weight.cuda()
        return torch.mm(gpu_input, gpu_weight)

    def __repr__(self):
        return '{} ({} -> {})'.format(
            self.__class__.__name__, self.in_features, self.out_features)
class Tree(nn.Module):
    """Neural decision tree whose leaves store a Gaussian (mean, covariance).

    Split nodes apply a sigmoid to randomly chosen neurons of the feature
    layer.  Every leaf keeps a mean vector, a covariance matrix, its inverse
    and a normalising factor, all stored as non-trainable parameters.
    """

    def __init__(self, depth, feature_length, vector_length, use_cuda=False):
        """
        Args:
            depth (int): depth of the neural decision tree.
            feature_length (int): number of neurons in the last feature layer;
                must be at least ``2 ** depth`` (sampling is without
                replacement).
            vector_length (int): length of the mean vector stored at each
                tree leaf node.
            use_cuda (bool): when true the leaf statistics are created on
                CUDA, otherwise on the CPU.
        """
        super(Tree, self).__init__()
        self.depth = depth
        self.n_leaf = 2 ** depth
        self.feature_length = feature_length
        self.vector_length = vector_length
        self.is_cuda = use_cuda

        onehot = np.eye(feature_length)
        # randomly use some neurons in the feature layer to compute the
        # decision function
        using_idx = np.random.choice(feature_length, self.n_leaf, replace=False)
        self.feature_mask = Parameter(
            torch.from_numpy(onehot[using_idx].T).type(torch.FloatTensor),
            requires_grad=False)

        # filled by the training loop; consumed by the leaf update methods
        self.mu_cache = []
        # use sigmoid function as the decision function
        self.decision = nn.Sequential(OrderedDict([
            ('sigmoid', nn.Sigmoid()),
        ]))

        # A leaf node contains a mean vector and a covariance matrix.  The
        # inverse covariance and the normalising factor are cached for
        # efficiency.
        # TODO: use k-means clustering to perform leaf node initialization
        identity_stack = np.array(
            [np.eye(self.vector_length) for _ in range(self.n_leaf)])
        # The original raised NotImplementedError when ``use_cuda`` was false,
        # leaving dead code behind; the same statistics are now built on
        # either device.
        self.mean = self._leaf_parameter(
            np.ones((self.n_leaf, self.vector_length)))
        self.covmat = self._leaf_parameter(identity_stack)
        self.covmat_inv = self._leaf_parameter(identity_stack.copy())
        self.factor = self._leaf_parameter(np.ones((self.n_leaf)))

    def _leaf_parameter(self, array):
        """Wrap a numpy array as a frozen float32 Parameter on the chosen device."""
        tensor = torch.from_numpy(array).type(torch.FloatTensor)
        if self.is_cuda:
            tensor = tensor.cuda()
        return Parameter(tensor, requires_grad=False)

    def forward(self, x, save_flag=False):
        """
        Args:
            x (Tensor): input feature batch of size [batch_size, n_features]
            save_flag (bool): when true also return intermediate results.
        Return:
            (Tensor): routing probability of size [batch_size, n_leaf]; with
            ``save_flag`` a ``(mu, cache)`` tuple where ``cache`` holds the
            keys ``'decision'`` and ``'mu'``.
        """
        cache = {}  # save some intermediate results for analysis
        # Use a device-matched local copy.  Assigning the tensor returned by
        # ``Parameter.cuda()`` back to ``self.feature_mask`` raises TypeError
        # because the name is a registered parameter.
        feature_mask = self.feature_mask.to(x.device)

        feats = torch.mm(x, feature_mask)            # -> [batch_size, n_leaf]
        decision = self.decision(feats)              # sigmoid
        decision = torch.unsqueeze(decision, dim=2)  # -> [batch_size, n_leaf, 1]
        decision = torch.cat((decision, 1 - decision), dim=2)  # [batch, n_leaf, 2]

        # compute route probability
        # note: decision[:, 0] is not used
        if save_flag:
            cache['decision'] = decision[:, :, 0]
        batch_size = x.size()[0]
        mu = x.new_ones(batch_size, 1, 1)
        begin_idx = 1
        end_idx = 2
        for n_layer in range(0, self.depth):
            # mu stores the probability a sample is routed to a certain node;
            # repeat it to be multiplied for left and right routing
            mu = mu.repeat(1, 1, 2)
            # the routing probability at n_layer
            _decision = decision[:, begin_idx:end_idx, :]  # [batch, 2**n_layer, 2]
            mu = mu * _decision
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (n_layer + 1)
            # merge left and right nodes into the same layer
            mu = mu.view(batch_size, -1, 1)
        mu = mu.view(batch_size, -1)

        if save_flag:
            cache['mu'] = mu
            return mu, cache
        return mu

    def pred(self, x):
        """
        Predict a vector based on stored vectors and routing probability
        Args:
            x (Tensor): input feature batch of size [batch_size, feature_length]
        Return:
            (Tensor): prediction [batch_size, vector_length]
        """
        return torch.mm(self(x), self.mean)

    def update_label_distribution(self, target_batch):
        """
        Compute new mean vectors and covariance matrices, treating every leaf
        as a multivariate Gaussian.

        Args:
            target_batch (list of Tensor): target batches, each of size
                [batch_size, vector_length]; they are concatenated along dim 0
                and must line up with the concatenation of ``self.mu_cache``.
        """
        target_batch = torch.cat(target_batch, dim=0)
        mu = torch.cat(self.mu_cache, dim=0)
        batch_size = len(mu)
        # no need for gradient computation
        with torch.no_grad():
            leaf_prob_density = mu.data.new(batch_size, self.n_leaf)
            for leaf_idx in range(self.n_leaf):
                # vectorized code is used for efficiency
                temp = target_batch - self.mean[leaf_idx, :]
                mahalanobis = (torch.mm(temp, self.covmat_inv[leaf_idx, :, :]) * temp).sum(dim=1)
                leaf_prob_density[:, leaf_idx] = (
                    self.factor[leaf_idx] * torch.exp(-0.5 * mahalanobis)
                ).clamp(FLT_MIN, FLT_MAX)
            numerator = (mu * leaf_prob_density).clamp(FLT_MIN, FLT_MAX)  # [batch_size, n_leaf]
            # add a dimension for broadcasting
            denominator = (numerator.sum(dim=1).unsqueeze(1)).clamp(FLT_MIN, FLT_MAX)
            zeta = numerator / denominator  # [batch_size, n_leaf]

            # new mean is a weighted sum of all training samples
            new_mean = (torch.mm(target_batch.transpose(0, 1), zeta)
                        / (zeta.sum(dim=0).unsqueeze(0))).transpose(0, 1)  # [n_leaf, vector_length]

            # allocate for new parameters
            new_covmat = new_mean.data.new(self.n_leaf, self.vector_length, self.vector_length)
            new_covmat_inv = new_mean.data.new(self.n_leaf, self.vector_length, self.vector_length)
            new_factor = new_mean.data.new(self.n_leaf)
            for leaf_idx in range(self.n_leaf):
                # new covariance matrix is a weighted sum of the covariance
                # contributions of each training sample
                weights = zeta[:, leaf_idx].unsqueeze(0)
                temp = target_batch - new_mean[leaf_idx, :]
                new_covmat[leaf_idx, :, :] = torch.mm(weights * (temp.transpose(0, 1)), temp) / (weights.sum())
                # update cache (factor and inverse) for future use
                new_covmat_inv[leaf_idx, :, :] = new_covmat[leaf_idx, :, :].inverse()
                if new_covmat[leaf_idx, :, :].det() <= 0:
                    print('Warning: singular matrix %d' % leaf_idx)
                new_factor[leaf_idx] = 1.0 / max((torch.sqrt(new_covmat[leaf_idx, :, :].det())), FLT_MIN)

        # update parameters
        self.mean = Parameter(new_mean, requires_grad=False)
        self.covmat = Parameter(new_covmat, requires_grad=False)
        self.covmat_inv = Parameter(new_covmat_inv, requires_grad=False)
        self.factor = Parameter(new_factor, requires_grad=False)
        return

    def update_label_distribution_simple(self, target_batch):
        """
        Compute new mean vectors with a simple update rule inspired by the
        traditional regression tree: each leaf mean becomes the average of
        the targets weighted by the cached routing probabilities.

        Args:
            target_batch (list of Tensor): target batches, each of size
                [batch_size, vector_length]; concatenated along dim 0 and
                aligned with the concatenation of ``self.mu_cache``.
        """
        target_batch = torch.cat(target_batch, dim=0)
        mu = torch.cat(self.mu_cache, dim=0)
        with torch.no_grad():
            # new mean is a weighted sum of all training samples
            new_mean = (torch.mm(target_batch.transpose(0, 1), mu)
                        / (mu.sum(dim=0).unsqueeze(0))).transpose(0, 1)  # [n_leaf, vector_length]
        # update parameters
        self.mean = Parameter(new_mean, requires_grad=False)
        return
class Tree(nn.Module):
    """Neural decision tree with a per-leaf class distribution ``pi``.

    Args:
        depth: number of routing layers; the tree has ``2 ** depth`` leaves.
        n_in_feature: number of input features.
        used_feature_rate: fraction of input features fed to the decision
            layer (``int(n_in_feature * used_feature_rate)`` are selected).
        n_class: number of classes stored per leaf.
        jointly_training: when true ``pi`` is a trainable random parameter
            that ``get_pi`` passes through a softmax; otherwise ``pi`` is a
            fixed uniform distribution updated through ``update_pi``.
    """

    def __init__(self, depth, n_in_feature, used_feature_rate, n_class,
                 jointly_training=True):
        super(Tree, self).__init__()
        self.depth = depth
        self.n_leaf = 2 ** depth
        self.n_class = n_class
        self.jointly_training = jointly_training

        # used features in this tree
        n_used_feature = int(n_in_feature * used_feature_rate)
        onehot = np.eye(n_in_feature)
        using_idx = np.random.choice(
            np.arange(n_in_feature), n_used_feature, replace=False)
        # Created on the CPU like every other parameter of the module.  The
        # original forced ``.cuda()`` here, which fails on CPU-only hosts and
        # left ``pi`` and the mask on different devices.
        self.feature_mask = Parameter(
            torch.from_numpy(onehot[using_idx].T).type(torch.FloatTensor),
            requires_grad=False)

        # leaf label distribution
        if jointly_training:
            pi = np.random.rand(self.n_leaf, n_class)
        else:
            pi = np.ones((self.n_leaf, n_class)) / n_class
        self.pi = Parameter(
            torch.from_numpy(pi).type(torch.FloatTensor),
            requires_grad=bool(jointly_training))

        # decision
        self.decision = nn.Sequential(OrderedDict([
            ('linear1', nn.Linear(n_used_feature, self.n_leaf)),
            ('sigmoid', nn.Sigmoid()),
        ]))

    def forward(self, x):
        """
        :param x: input features, [batch_size, n_features]
        :return: route probability, [batch_size, n_leaf]
        """
        # Use a device-matched local copy.  Assigning the tensor returned by
        # ``Parameter.cuda()`` back to ``self.feature_mask`` raises TypeError
        # because the name is a registered parameter.
        feature_mask = self.feature_mask.to(x.device)

        feats = torch.mm(x, feature_mask)   # -> [batch_size, n_used_feature]
        decision = self.decision(feats)     # -> [batch_size, n_leaf]
        decision = torch.unsqueeze(decision, dim=2)
        decision = torch.cat((decision, 1 - decision), dim=2)  # [batch, n_leaf, 2]

        # compute route probability
        # note: decision[:, 0] is not used
        batch_size = x.size()[0]
        _mu = x.new_ones(batch_size, 1, 1)
        begin_idx = 1
        end_idx = 2
        for n_layer in range(0, self.depth):
            # duplicate every node probability for its two children
            _mu = _mu.view(batch_size, -1, 1).repeat(1, 1, 2)
            _decision = decision[:, begin_idx:end_idx, :]  # [batch, 2**n_layer, 2]
            _mu = _mu * _decision
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (n_layer + 1)

        return _mu.view(batch_size, self.n_leaf)

    def get_pi(self):
        """Return the leaf distributions (softmax-normalised when trained jointly)."""
        if self.jointly_training:
            return F.softmax(self.pi, dim=-1)
        return self.pi

    def cal_prob(self, mu, pi):
        """
        :param mu: [batch_size, n_leaf]
        :param pi: [n_leaf, n_class]
        :return: label probability [batch_size, n_class]
        """
        return torch.mm(mu, pi)

    def update_pi(self, new_pi):
        """Overwrite the stored leaf distributions in place."""
        self.pi.data = new_pi
class Tree(nn.Module):
    """Neural decision tree whose leaves store a vector ``pi``.

    Split nodes apply a sigmoid to randomly chosen neurons of the feature
    layer; ``pi`` starts at zero and is refreshed by
    ``update_label_distribution``.
    """

    def __init__(self, depth, feature_length, vector_length, use_cuda=False):
        """
        Args:
            depth (int): depth of the neural decision tree.
            feature_length (int): number of neurons in the last feature layer;
                must be at least ``2 ** depth`` (sampling is without
                replacement).
            vector_length (int): length of the vector stored at each tree
                leaf node.
            use_cuda (bool): when true ``pi`` is created on CUDA.
        """
        super(Tree, self).__init__()
        self.depth = depth
        self.n_leaf = 2 ** depth
        self.feature_length = feature_length
        self.vector_length = vector_length
        self.is_cuda = use_cuda
        # used in leaf node update; filled by the training loop
        self.mu_cache = []

        onehot = np.eye(feature_length)
        # randomly use some neurons in the feature layer to compute the
        # decision function
        self.using_idx = np.random.choice(
            feature_length, self.n_leaf, replace=False)
        self.feature_mask = Parameter(
            torch.from_numpy(onehot[self.using_idx].T).type(torch.FloatTensor),
            requires_grad=False)

        # per-leaf vector, initialised to zero
        pi = torch.from_numpy(
            np.zeros((self.n_leaf, self.vector_length))).type(torch.FloatTensor)
        if use_cuda:
            pi = pi.cuda()
        self.pi = Parameter(pi, requires_grad=False)

        # use sigmoid function as the decision function
        self.decision = nn.Sequential(OrderedDict([
            ('sigmoid', nn.Sigmoid()),
        ]))

    def forward(self, x, save_flag=False):
        """
        Args:
            x (Tensor): input feature batch of size [batch_size, n_features]
            save_flag (bool): when true also return intermediate results.
        Return:
            (Tensor): routing probability of size [batch_size, n_leaf]; with
            ``save_flag`` a ``(mu, cache)`` tuple where ``cache['decision']``
            holds the sigmoid outputs.
        """
        cache = {}  # save some intermediate results for analysis
        # Use a device-matched local copy.  Assigning the tensor returned by
        # ``Parameter.cuda()`` back to ``self.feature_mask`` raises TypeError
        # because the name is a registered parameter.
        feature_mask = self.feature_mask.to(x.device)

        feats = torch.mm(x, feature_mask)            # -> [batch_size, n_leaf]
        decision = self.decision(feats)              # sigmoid
        decision = torch.unsqueeze(decision, dim=2)  # -> [batch_size, n_leaf, 1]
        decision = torch.cat((decision, 1 - decision), dim=2)  # [batch, n_leaf, 2]

        # compute route probability
        # note: decision[:, 0] is not used
        if save_flag:
            cache['decision'] = decision[:, :, 0]
        batch_size = x.size()[0]
        mu = x.new_ones(batch_size, 1, 1)
        begin_idx = 1
        end_idx = 2
        for n_layer in range(0, self.depth):
            # mu stores the probability a sample is routed to a certain node;
            # repeat it to be multiplied for left and right routing
            mu = mu.repeat(1, 1, 2)
            # the routing probability at n_layer
            _decision = decision[:, begin_idx:end_idx, :]  # [batch, 2**n_layer, 2]
            mu = mu * _decision
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (n_layer + 1)
            # merge left and right nodes into the same layer
            mu = mu.view(batch_size, -1, 1)
        mu = mu.view(batch_size, -1)

        if save_flag:
            return mu, cache
        return mu

    def pred(self, x):
        """
        Predict a vector based on stored vectors and routing probability
        Args:
            x (Tensor): input feature batch of size [batch_size, feature_length]
        Return:
            (Tensor): prediction [batch_size, vector_length]
        """
        return torch.mm(self(x), self.pi)

    def get_pi(self):
        """Return the stored per-leaf vectors."""
        return self.pi

    def cal_prob(self, mu, pi):
        """
        :param mu: [batch_size, n_leaf]
        :param pi: [n_leaf, n_class]
        :return: label probability [batch_size, n_class]
        """
        return torch.mm(mu, pi)

    def update_label_distribution(self, target_batches):
        """
        Recompute ``pi`` from the cached routing probabilities.

        For every cached batch, ``target * pi * mu / (mu @ pi)`` is summed
        over the batch; the accumulated table is then softmax-normalised
        along the class dimension and stored as the new ``pi``.

        Args:
            target_batches (iterable of Tensor): one target tensor of size
                [batch_size, vector_length] per entry of ``self.mu_cache``.
        """
        with torch.no_grad():
            new_pi = self.pi.data.new(
                self.n_leaf, self.vector_length).fill_(0.)  # [n_leaf, n_class]
            for mu, target in zip(self.mu_cache, target_batches):
                prob = torch.mm(mu, self.pi)   # [batch_size, n_class]
                _target = target.unsqueeze(1)  # [batch_size, 1, n_class]
                _pi = self.pi.unsqueeze(0)     # [1, n_leaf, n_class]
                _mu = mu.unsqueeze(2)          # [batch_size, n_leaf, 1]
                # clamp to avoid division by zero
                _prob = torch.clamp(
                    prob.unsqueeze(1), min=1e-6, max=1.)  # [batch_size, 1, n_class]
                _new_pi = torch.mul(
                    torch.mul(_target, _pi), _mu) / _prob  # [batch, n_leaf, n_class]
                new_pi += torch.sum(_new_pi, dim=0)
        new_pi = F.softmax(new_pi, dim=1).data
        self.pi = Parameter(new_pi, requires_grad=False)
        return
class Tree(nn.Module):
    """Neural decision tree with a fixed two-class distribution per leaf.

    Args:
        depth: number of routing layers; the tree has ``2 ** depth`` leaves.
        n_in_feature: number of input features.
        used_feature_rate: fraction of input features fed to the decision
            layer (``int(n_in_feature * used_feature_rate)`` are selected).
    """

    def __init__(self, depth, n_in_feature, used_feature_rate):
        super(Tree, self).__init__()
        self.depth = depth
        self.n_leaf = 2 ** depth

        n_used_feature = int(n_in_feature * used_feature_rate)
        onehot = np.eye(n_in_feature)
        # NOTE(review): this reseeds numpy's *global* RNG on every
        # construction, so all trees pick the same features and later
        # np.random calls in the process are affected.  Kept for
        # reproducibility with existing runs.
        np.random.seed(0)
        using_idx = np.random.choice(
            np.arange(n_in_feature), n_used_feature, replace=False)
        self.feature_mask = Parameter(
            torch.from_numpy(onehot[using_idx].T).type(torch.FloatTensor),
            requires_grad=False)

        # uniform two-class distribution at every leaf
        pi = np.ones((self.n_leaf, 2)) / 2
        self.pi = Parameter(
            torch.from_numpy(pi).type(torch.FloatTensor),
            requires_grad=False)

        self.decision = nn.Sequential(OrderedDict([
            ('linear1', nn.Linear(in_features=n_used_feature,
                                  out_features=self.n_leaf)),
            ('sigmoid', nn.Sigmoid()),
        ]))

    def forward(self, x):
        """Return leaf routing probabilities of size [batch_size, n_leaf]."""
        # Use a device-matched local copy.  Assigning the tensor returned by
        # ``Parameter.cuda()`` back to ``self.feature_mask`` raises TypeError
        # because the name is a registered parameter.
        feature_mask = self.feature_mask.to(x.device)

        feats = torch.mm(x, feature_mask)
        decision = self.decision(feats)
        decision = torch.unsqueeze(decision, dim=2)
        decision = torch.cat((decision, 1 - decision), dim=2)  # [batch, n_leaf, 2]

        # decision[:, 0] is not used: routing starts at index 1
        batch_size = x.size()[0]
        _mu = x.new_ones(batch_size, 1, 1)
        begin_idx = 1
        end_idx = 2
        for n_layer in range(self.depth):
            # duplicate every node probability for its two children
            _mu = _mu.view(batch_size, -1, 1).repeat(1, 1, 2)
            _decision = decision[:, begin_idx:end_idx, :]
            _mu = _mu * _decision
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (n_layer + 1)

        return _mu.view(batch_size, self.n_leaf)

    def get_pi(self):
        """Return the stored leaf distributions."""
        return self.pi

    def cal_prob(self, mu, pi):
        """Return ``mu @ pi``: label probabilities of size [batch_size, 2]."""
        return torch.mm(mu, pi)

    def update_pi(self, new_pi):
        """Overwrite the stored leaf distributions in place."""
        self.pi.data = new_pi
class Tree(nn.Module):
    """Neural decision tree with a per-leaf class distribution ``pi``.

    Args:
        depth: number of routing layers; the tree has ``2 ** depth`` leaves.
        n_in_feature: number of input features.
        used_feature_rate: fraction of input features fed to the decision
            layer (``int(n_in_feature * used_feature_rate)`` are selected).
        n_class: number of classes stored per leaf.
        jointly_training: when true ``pi`` is a trainable random parameter
            that ``get_pi`` passes through a softmax; otherwise ``pi`` is a
            fixed uniform distribution updated through ``update_pi``.
    """

    def __init__(self, depth, n_in_feature, used_feature_rate, n_class,
                 jointly_training=True):
        super(Tree, self).__init__()
        self.depth = depth
        self.n_leaf = 2 ** depth
        self.n_class = n_class
        self.jointly_training = jointly_training

        # used features in this tree
        n_used_feature = int(n_in_feature * used_feature_rate)
        onehot = np.eye(n_in_feature)
        using_idx = np.random.choice(
            np.arange(n_in_feature), n_used_feature, replace=False)
        self.feature_mask = Parameter(
            torch.from_numpy(onehot[using_idx].T).type(torch.FloatTensor),
            requires_grad=False)

        # initialize leaf label distribution pi = [n_leaf, n_class]
        if jointly_training:
            # randomly distributed in (0, 1)
            pi = np.random.rand(self.n_leaf, n_class)
        else:
            # equally distributed => 1 / n_class
            pi = np.ones((self.n_leaf, n_class)) / n_class
        self.pi = Parameter(
            torch.from_numpy(pi).type(torch.FloatTensor),
            requires_grad=bool(jointly_training))

        # split decision
        self.decision = nn.Sequential(OrderedDict([
            ('linear1', nn.Linear(n_used_feature, self.n_leaf)),
            ('sigmoid', nn.Sigmoid()),
        ]))

    def forward(self, x):
        """
        :param x: input features, [batch_size, n_features]
        :return: route probability, [batch_size, n_leaf]
        """
        # Use a device-matched local copy.  Assigning the tensor returned by
        # ``Parameter.cuda()`` back to ``self.feature_mask`` raises TypeError
        # because the name is a registered parameter.
        feature_mask = self.feature_mask.to(x.device)

        # select the random subset of features
        feats = torch.mm(x, feature_mask)   # -> [batch_size, n_used_feature]
        # linear + sigmoid maps the features to one value per tree node
        decision = self.decision(feats)     # -> [batch_size, n_leaf]
        decision = torch.unsqueeze(decision, dim=2)
        decision = torch.cat((decision, 1 - decision), dim=2)  # [batch, n_leaf, 2]

        # compute route probability
        # note: decision[:, 0] is not used
        # (the debug print calls that ran on every forward pass were removed)
        batch_size = x.size()[0]
        _mu = x.new_ones(batch_size, 1, 1)
        begin_idx = 1
        end_idx = 2
        for n_layer in range(0, self.depth):
            # view: reshape to [batch, -1, 1], one mu per node per sample
            # repeat: duplicate each mu so it lines up with the two entries
            # (left / right) of the decision tensor
            _mu = _mu.view(batch_size, -1, 1).repeat(1, 1, 2)
            _decision = decision[:, begin_idx:end_idx, :]  # [batch, 2**n_layer, 2]
            _mu = _mu * _decision
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (n_layer + 1)

        return _mu.view(batch_size, self.n_leaf)  # [batch_size, n_leaf]

    def get_pi(self):
        """Return the leaf label distributions (softmax-normalised when trained jointly)."""
        if self.jointly_training:
            return F.softmax(self.pi, dim=-1)
        return self.pi

    def cal_prob(self, mu, pi):
        """
        :param mu: [batch_size, n_leaf]
        :param pi: [n_leaf, n_class]
        :return: label probability [batch_size, n_class]
        """
        return torch.mm(mu, pi)  # tree probability P_T

    def update_pi(self, new_pi):
        """Overwrite the stored leaf distributions in place."""
        self.pi.data = new_pi
class ResNet(nn.Module):
    """ResNet backbone preceded by a fixed random row/column sub-sampling.

    Every input plane is reduced to the 50 rows and 50 columns picked by a
    one-hot ``feature_mask`` before it enters the convolutional stack.  The
    head is a single bias-free linear unit plus a separate scalar bias,
    followed by a sigmoid.

    Args:
        block: residual block class; must expose ``expansion`` and accept
            ``(inplanes, planes, stride, downsample)``.
        layers: number of blocks in each of the four stages.
        num_classes: stored on the instance; the head always has one output.
        grayscale: when true the first convolution expects 1 input channel,
            otherwise 3.
    """

    def __init__(self, block, layers, num_classes, grayscale):
        super(ResNet, self).__init__()
        self.num_classes = num_classes
        self.inplanes = 64
        feature_length = 224
        in_dim = 1 if grayscale else 3

        # randomly pick 50 of the 224 rows/columns of every input plane
        onehot = np.eye(feature_length)
        using_idx = np.random.choice(feature_length, 50, replace=False)
        self.feature_mask = Parameter(
            torch.from_numpy(onehot[using_idx].T).type(torch.FloatTensor),
            requires_grad=False)

        self.conv1 = nn.Conv2d(in_dim, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        # defined for completeness; ``forward`` does not apply it
        self.avgpool = nn.AvgPool2d(7, stride=1, padding=2)
        self.fc = nn.Linear(2048, 1, bias=False)
        self.linear_1_bias = nn.Parameter(torch.zeros(1).float())

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, (2. / n) ** .5)
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def pred(self, x):
        """Alias for calling the module: returns ``self(x)``."""
        return self(x)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack ``blocks`` residual blocks, downsampling in the first if needed."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return sigmoid probabilities of size [batch_size, 1].

        ``x`` is expected to be [batch_size, channels, 224, 224]: both spatial
        dimensions are multiplied with the 224-row ``feature_mask``.
        """
        # Run on whatever device the module lives on.  The original forced
        # CUDA and re-assigned the registered ``feature_mask`` parameter,
        # which raises TypeError.
        mask = self.feature_mask
        x = x.to(mask.device)

        # mask^T @ plane^T @ mask for every (sample, channel) plane at once.
        # This replaces the per-plane Python loops that wrote into a
        # hard-coded 30 x 3 x 50 x 50 buffer, so any batch size and channel
        # count works and no stale rows are left behind.
        x = torch.matmul(torch.matmul(x, mask).transpose(-1, -2), mask)

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = x.view(x.size(0), -1)
        logits = self.fc(x)
        logits = logits + self.linear_1_bias
        probas = torch.sigmoid(logits)
        return probas
class Tree(nn.Module):
    """Neural decision tree whose leaves store a distribution ``pi``.

    Split nodes apply a sigmoid to randomly chosen neurons of the feature
    layer; ``pi`` starts uniform (``1 / vector_length``) and is refreshed by
    ``update_label_distribution``.
    """

    def __init__(self, depth, feature_length, vector_length, use_cuda=False):
        """
        Args:
            depth (int): depth of the neural decision tree.
            feature_length (int): number of neurons in the last feature layer;
                must be at least ``2 ** depth`` (sampling is without
                replacement).
            vector_length (int): length of the vector stored at each tree
                leaf node (presumably the number of classes -- confirm
                against the caller).
            use_cuda (bool): when true ``pi`` is created on CUDA.
        """
        super(Tree, self).__init__()
        self.depth = depth
        self.n_leaf = 2 ** depth
        self.feature_length = feature_length
        self.vector_length = vector_length
        self.is_cuda = use_cuda
        # used in leaf node update; filled by the training loop
        self.mu_cache = []

        # Rows of the identity picked by ``using_idx`` give, after the
        # transpose, a [feature_length, n_leaf] matrix whose column k selects
        # the feature assigned to tree node k.
        onehot = np.eye(feature_length)
        # randomly use some neurons in the feature layer to compute the
        # decision function
        self.using_idx = np.random.choice(
            feature_length, self.n_leaf, replace=False)
        self.feature_mask = Parameter(
            torch.from_numpy(onehot[self.using_idx].T).type(torch.FloatTensor),
            requires_grad=False)

        # pi has n_leaf rows and vector_length columns; every row starts as
        # a uniform distribution
        pi = torch.from_numpy(
            np.ones((self.n_leaf, self.vector_length)) / self.vector_length
        ).type(torch.FloatTensor)
        if use_cuda:
            pi = pi.cuda()
        self.pi = Parameter(pi, requires_grad=False)

        # use sigmoid function as the decision function
        self.decision = nn.Sequential(OrderedDict([
            ('sigmoid', nn.Sigmoid()),
        ]))

    def forward(self, x, save_flag=False, wavelet=None):
        """
        Args:
            x (Tensor): input feature batch of size [batch_size, n_features]
            save_flag (bool): when true also return intermediate results.
            wavelet: accepted for interface compatibility; not used.
        Return:
            (Tensor): routing probability ``mu`` of size [batch_size, n_leaf];
            with ``save_flag`` a ``(mu, cache)`` tuple where
            ``cache['decision']`` holds the sigmoid outputs.
        """
        cache = {}  # save some intermediate results for analysis
        # Use a device-matched local copy.  Assigning the tensor returned by
        # ``Parameter.cuda()`` back to ``self.feature_mask`` raises TypeError
        # because the name is a registered parameter.
        feature_mask = self.feature_mask.to(x.device)

        # x [batch_size, feature_length] @ mask [feature_length, n_leaf]
        feats = torch.mm(x, feature_mask)            # -> [batch_size, n_leaf]
        decision = self.decision(feats)              # sigmoid
        decision = torch.unsqueeze(decision, dim=2)  # -> [batch_size, n_leaf, 1]
        decision = torch.cat((decision, 1 - decision), dim=2)  # [batch, n_leaf, 2]

        # compute route probability
        # note: decision[:, 0] is not used
        if save_flag:
            cache['decision'] = decision[:, :, 0]
        batch_size = x.size()[0]
        # ones with the same dtype and device as x
        mu = x.new_ones(batch_size, 1, 1)
        begin_idx = 1
        end_idx = 2
        for n_layer in range(0, self.depth):
            # mu stores the probability a sample is routed to a certain node;
            # repeat it to be multiplied for left and right routing
            mu = mu.repeat(1, 1, 2)
            # the routing probability at n_layer
            _decision = decision[:, begin_idx:end_idx, :]  # [batch, 2**n_layer, 2]
            mu = mu * _decision
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (n_layer + 1)
            # merge left and right nodes into the same layer
            mu = mu.view(batch_size, -1, 1)
        mu = mu.view(batch_size, -1)

        if save_flag:
            return mu, cache
        return mu

    def pred(self, x):
        """
        Predict a vector based on stored vectors and routing probability
        Args:
            x (Tensor): input feature batch of size [batch_size, feature_length]
        Return:
            (Tensor): prediction [batch_size, vector_length]
        """
        return torch.mm(self(x), self.pi)

    def get_pi(self):
        """Return the stored per-leaf distributions."""
        return self.pi

    def cal_prob(self, mu, pi):
        """
        :param mu: [batch_size, n_leaf]
        :param pi: [n_leaf, n_class]
        :return: label probability [batch_size, n_class]
        """
        return torch.mm(mu, pi)

    def update_label_distribution(self, target_batches):
        """
        Recompute ``pi`` from the cached routing probabilities.

        For every cached batch, ``target * pi * mu / (mu @ pi)`` is summed
        over the batch; the accumulated table is then softmax-normalised
        along the class dimension and stored as the new ``pi``.

        Args:
            target_batches (iterable of Tensor): one target tensor of size
                [batch_size, vector_length] per entry of ``self.mu_cache``.
        """
        with torch.no_grad():
            new_pi = self.pi.data.new(
                self.n_leaf, self.vector_length).fill_(.0)  # [n_leaf, n_class]
            for mu, target in zip(self.mu_cache, target_batches):
                prob = torch.mm(mu, self.pi)   # [batch_size, n_class]
                _target = target.unsqueeze(1)  # [batch_size, 1, n_class]
                _pi = self.pi.unsqueeze(0)     # [1, n_leaf, n_class]
                _mu = mu.unsqueeze(2)          # [batch_size, n_leaf, 1]
                # clamp to avoid division by zero
                _prob = torch.clamp(
                    prob.unsqueeze(1), min=1e-6, max=1.)  # [batch_size, 1, n_class]
                _new_pi = torch.mul(
                    torch.mul(_target, _pi), _mu) / _prob  # [batch, n_leaf, n_class]
                new_pi += torch.sum(_new_pi, dim=0)
        # NOTE(review): normalisation uses a softmax rather than dividing by
        # the row sum; an earlier reviewer questioned this too.  Behaviour
        # kept as is.
        new_pi = F.softmax(new_pi, dim=1).data
        self.pi = Parameter(new_pi, requires_grad=False)
        return