def __init__(self, hparams):
    super(PositionwiseFF, self).__init__()
    self.hparams = hparams
    self.w_1 = nn.Linear(hparams.d_model, hparams.d_inner, bias=False)
    self.w_2 = nn.Linear(hparams.d_inner, hparams.d_model, bias=False)
    self.dropout = nn.Dropout(hparams.dropout)
    self.relu = nn.ReLU()
    self.layer_norm = LayerNormalization(hparams.d_model, hparams)
    init_param(self.w_1.weight, init_type="uniform", init_range=hparams.init_range)
    init_param(self.w_2.weight, init_type="uniform", init_range=hparams.init_range)
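
# Hedged sketch (not from the source): the forward pass implied by the modules
# above -- a position-wise feed-forward block. The residual connection and the
# post-norm placement of layer_norm are assumptions.
def forward(self, x):
    residual = x
    out = self.w_2(self.relu(self.w_1(x)))  # d_model -> d_inner -> d_model
    out = self.dropout(out)
    return self.layer_norm(out + residual)  # assumed post-norm residual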
def __init__(self, input_dim, output_dim, use_bias=True, name=''):
    super(SigmoidLinear, self).__init__()
    self.name = name
    self.use_bias = use_bias
    # register parameters; self.params is assumed to be set up by the base class
    self.W = init_param((input_dim, output_dim))
    self.params[self.name + '/W'] = self.W
    if self.use_bias:
        self.b = init_param((output_dim,))
        self.params[self.name + '/b'] = self.b
    self.input = None
    self.output = None
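
# Hedged sketch (not from the source): a forward pass consistent with the fields
# above. Assumes a NumPy-backed framework (import numpy as np) and that the class
# name implies an elementwise sigmoid nonlinearity.
def forward(self, x):
    self.input = x
    z = x.dot(self.W)
    if self.use_bias:
        z = z + self.b
    self.output = 1.0 / (1.0 + np.exp(-z))  # sigmoid(z)
    return self.output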
def __init__(self, hparams):
    super(MultiHeadAttn, self).__init__()
    self.hparams = hparams
    self.attention = ScaledDotProdAttn(hparams)
    self.layer_norm = LayerNormalization(hparams.d_model, hparams)

    # fused q/k/v projections for all heads, plus projection of concatenated attn
    n_heads = self.hparams.n_heads
    d_model = self.hparams.d_model
    d_q = self.hparams.d_k
    d_k = self.hparams.d_k
    d_v = self.hparams.d_v  # d_q == d_k == d_v
    self.q = nn.Linear(d_model, n_heads * d_q, bias=False)
    self.k = nn.Linear(d_model, n_heads * d_k, bias=False)
    self.v = nn.Linear(d_model, n_heads * d_v, bias=False)
    init_param(self.q.weight, init_type="uniform", init_range=hparams.init_range)
    init_param(self.k.weight, init_type="uniform", init_range=hparams.init_range)
    init_param(self.v.weight, init_type="uniform", init_range=hparams.init_range)
    if self.hparams.cuda:
        self.q = self.q.cuda()
        self.k = self.k.cuda()
        self.v = self.v.cuda()

    self.w_proj = nn.Linear(n_heads * d_v, d_model, bias=False)
    init_param(self.w_proj.weight, init_type="uniform", init_range=hparams.init_range)
    if self.hparams.cuda:
        self.w_proj = self.w_proj.cuda()
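
# Hedged sketch (not from the source): how the fused q/k/v projections above are
# typically split into heads and recombined. Tensor shapes, the ScaledDotProdAttn
# call signature, and the residual placement are assumptions.
def forward(self, q_in, k_in, v_in, attn_mask=None):
    hp = self.hparams
    batch, len_q, _ = q_in.size()
    # project, then reshape to (batch * n_heads, seq_len, d_head)
    q = self.q(q_in).view(batch, -1, hp.n_heads, hp.d_k).transpose(1, 2).reshape(batch * hp.n_heads, -1, hp.d_k)
    k = self.k(k_in).view(batch, -1, hp.n_heads, hp.d_k).transpose(1, 2).reshape(batch * hp.n_heads, -1, hp.d_k)
    v = self.v(v_in).view(batch, -1, hp.n_heads, hp.d_v).transpose(1, 2).reshape(batch * hp.n_heads, -1, hp.d_v)
    out = self.attention(q, k, v, attn_mask=attn_mask)
    # merge heads back: (batch, len_q, n_heads * d_v), then project to d_model
    out = out.reshape(batch, hp.n_heads, len_q, hp.d_v).transpose(1, 2).reshape(batch, len_q, -1)
    return self.layer_norm(self.w_proj(out) + q_in)  # assumed residual + post-norm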
def train(x_train, y_train):
    # `m` is assumed to be a module-level matrix (one column per weight vector)
    num_of_vectors = m.shape[1]
    vectors = []
    for i in range(num_of_vectors):
        vector = {}
        vector['i'] = i
        vector['wi'] = init_param(x_train[0])
        vector['win'], vector['lose'] = get_win_lose_classes_by_column(m, i)
        vectors.append(vector)
    for i in range(len(y_train)):
        x = x_train[i]
        y = y_train[i]
        for v in vectors:
            v['wi'], lossi = train_custom(v, x, y, i)  # per-step loss is unused
    weights = [v['wi'] for v in vectors]
    return weights
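
# Hedged usage sketch (assumption): one weight vector is learned per column of
# the module-level matrix `m`, e.g. for per-class linear scoring. `x_new` and the
# scoring rule below are hypothetical.
# weights = train(x_train, y_train)
# scores = [w.dot(x_new) for w in weights]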
def __init__(self, hparams):
    super(RelativeMultiHeadAttn, self).__init__()
    self.hparams = hparams
    self.attention = ScaledDotProdAttn(hparams)
    self.layer_norm = LayerNormalization(hparams.d_model, hparams)
    self.temp = np.power(hparams.d_model, 0.5)
    self.softmax = nn.Softmax(dim=-1)
    self.pos_emb = PositionalEmbedding(hparams)
    self.dropout = nn.Dropout(hparams.dropout)

    # per-head projections, plus projection of the concatenated attn
    n_heads = self.hparams.n_heads
    d_model = self.hparams.d_model
    d_q = self.hparams.d_k
    d_k = self.hparams.d_k
    d_v = self.hparams.d_v
    Q, K, V, R = [], [], [], []
    for head_id in range(n_heads):
        q = nn.Linear(d_model, d_q, bias=False)
        k = nn.Linear(d_model, d_k, bias=False)
        v = nn.Linear(d_model, d_v, bias=False)
        r = nn.Linear(self.hparams.d_word_vec, d_k, bias=False)
        init_param(q.weight, init_type="uniform", init_range=hparams.init_range)
        init_param(k.weight, init_type="uniform", init_range=hparams.init_range)
        init_param(v.weight, init_type="uniform", init_range=hparams.init_range)
        init_param(r.weight, init_type="uniform", init_range=hparams.init_range)
        Q.append(q)
        K.append(k)
        V.append(v)
        R.append(r)
    self.Q = nn.ModuleList(Q)
    self.K = nn.ModuleList(K)
    self.V = nn.ModuleList(V)
    self.R = nn.ModuleList(R)
    if self.hparams.cuda:
        self.Q = self.Q.cuda()
        self.K = self.K.cuda()
        self.V = self.V.cuda()
        self.R = self.R.cuda()

    if self.hparams.relative_pos_c:
        self.u = nn.Linear(d_q, 1, bias=False)
        init_param(self.u.weight, init_type="uniform", init_range=hparams.init_range)
    if self.hparams.relative_pos_d:
        self.v = nn.Linear(d_q, 1, bias=False)
        init_param(self.v.weight, init_type="uniform", init_range=hparams.init_range)

    self.w_proj = nn.Linear(n_heads * d_v, d_model, bias=False)
    init_param(self.w_proj.weight, init_type="uniform", init_range=hparams.init_range)
    if self.hparams.cuda:
        self.w_proj = self.w_proj.cuda()
        if self.hparams.relative_pos_c:
            self.u = self.u.cuda()
        if self.hparams.relative_pos_d:
            self.v = self.v.cuda()
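
# Note (hedged): the relative_pos_c / relative_pos_d branches above appear to
# implement terms (c) and (d) of the Transformer-XL relative attention score,
#   A_ij = q_i^T k_j + q_i^T W_r r_{i-j} + u^T k_j + v^T W_r r_{i-j},
# where u and v are learned global content/position biases; here they are stored
# as single-output nn.Linear layers over d_q-dimensional inputs.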
with open("result.txt", "a") as f: f.write("episode{0}\tReward: {1:.2f}".format( i_episode, val_reward)) f.write("\n") RM.reset() # break with open("total_reward.txt", "a") as f: f.write("{0}\t{1}".format(i_episode, total_reward)) f.write("\n") if __name__ == "__main__": args = parser.parse_args() model = DQN().to(device) init_param(model) train(model) # NOTE: this is required for the ``fork`` method to work # model = DQN().to(device) # num_processes = 2 # model.share_memory() # processes = [] # for rank in range(num_processes): # p = mp.Process(target=train, args=(model,)) # p.start() # processes.append(p) # for p in processes: # p.join()
def __init__(self, hparams, enc=False, n_layer=-1):
    super(RelativeMultiHeadAttn, self).__init__()
    self.hparams = hparams
    self.set_sep = (n_layer in self.hparams.sep_layer) and enc
    self.enc = enc
    self.n_layer = n_layer
    self.layer_norm = torch.nn.LayerNorm(hparams.d_model)
    self.temp = np.power(hparams.d_model, 0.5)
    self.softmax = nn.Softmax(dim=2)
    self.pos_emb = PositionalEmbedding(hparams)
    self.dropout = nn.Dropout(hparams.dropout)

    # fused q/k/v projections for all heads, plus projection of concatenated attn
    n_heads = self.hparams.n_heads
    d_model = self.hparams.d_model
    d_q = self.hparams.d_k
    d_k = self.hparams.d_k
    d_v = self.hparams.d_v
    self.q = nn.Linear(d_model, n_heads * d_q, bias=False)
    self.k = nn.Linear(d_model, n_heads * d_k, bias=False)
    self.v = nn.Linear(d_model, n_heads * d_v, bias=False)
    init_param(self.q.weight, init_type="uniform", init_range=hparams.init_range)
    init_param(self.k.weight, init_type="uniform", init_range=hparams.init_range)
    init_param(self.v.weight, init_type="uniform", init_range=hparams.init_range)

    if self.hparams.sep_head_weight and self.enc:
        self.head_w = []
        for i in range(self.hparams.lan_size):
            h_w = nn.Linear(d_model, n_heads, bias=False)
            init_param(h_w.weight, init_type="uniform", init_range=hparams.init_range)
            self.head_w.append(h_w)
        self.head_w = nn.ModuleList(self.head_w)
        if self.hparams.cuda:
            self.head_w = self.head_w.cuda()

    if self.enc and self.n_layer < self.hparams.max_loc_layer:
        self.r = []
        r = nn.Linear(d_model, n_heads * d_v, bias=False)
        init_param(r.weight, init_type="uniform", init_range=hparams.init_range)
        self.r.append(r)
        self.r = nn.ModuleList(self.r)

    if self.hparams.cuda:
        self.q = self.q.cuda()
        self.k = self.k.cuda()
        self.v = self.v.cuda()
        if self.enc and self.n_layer < self.hparams.max_loc_layer:
            self.r = self.r.cuda()

    if self.hparams.relative_pos_c:
        ub = nn.Linear(d_q, 1, bias=False)
        init_param(ub.weight, init_type="uniform", init_range=hparams.init_range)
        self.ub = ub
    if self.hparams.relative_pos_d and (self.enc and self.n_layer < self.hparams.max_loc_layer):
        self.vb = []
        vb = nn.Linear(d_q, 1, bias=False)
        init_param(vb.weight, init_type="uniform", init_range=hparams.init_range)
        self.vb.append(vb)
        self.vb = nn.ModuleList(self.vb)
        if self.hparams.cuda:
            self.vb = self.vb.cuda()

    self.w_proj = nn.Linear(n_heads * d_v, d_model, bias=False)
    init_param(self.w_proj.weight, init_type="uniform", init_range=hparams.init_range)
    if self.hparams.cuda:
        self.w_proj = self.w_proj.cuda()
        if self.hparams.relative_pos_c:
            self.ub = self.ub.cuda()
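
# Note (hedged): self.head_w looks like one linear map per language
# (hparams.lan_size of them), each producing n_heads scalars -- i.e. per-language
# gating weights over the encoder's attention heads; how they are applied in the
# forward pass is not shown here.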
def train_continuous_mnist(args, model, device, train_loader, test_loader):
    ava_test = []
    weight_lst = utils.weight_lst(model)
    w_mat_lst, m_mat_lst, a_mat_lst, b_mat_lst, avg_psi_mat_lst, e_a_mat_lst, e_b_mat_lst = \
        utils.init_param(weight_lst, args.s_init, device, True, args.alpha)
    for task in range(len(test_loader)):
        for epoch in range(1, args.epochs + 1):
            for batch_idx, (data, target) in enumerate(train_loader[0]):
                model.train()
                data, target = data.to(device), target.to(device)
                data = data.view(-1, 784)
                for mc_iter in range(args.train_mc_iters):
                    # Phi ~ MN(0, I, I)
                    phi_mat_lst = utils.gen_phi(w_mat_lst, device)
                    # W = M + B*Phi*A^t
                    utils.randomize_weights(weight_lst, w_mat_lst, m_mat_lst,
                                            a_mat_lst, b_mat_lst, phi_mat_lst)
                    output = model(data)
                    criterion = nn.CrossEntropyLoss()
                    loss = args.batch_size * criterion(output, target)
                    utils.zero_grad(weight_lst)
                    loss.backward()
                    grad_mat_lst = utils.weight_grad(weight_lst, device)
                    utils.aggregate_grads(args, avg_psi_mat_lst, grad_mat_lst)
                    utils.aggregate_e_a(args, e_a_mat_lst, grad_mat_lst, b_mat_lst, phi_mat_lst)
                    utils.aggregate_e_b(args, e_b_mat_lst, grad_mat_lst, a_mat_lst, phi_mat_lst)
                # M = M - B*B^t*avg_Phi*A*A^t
                utils.update_m(m_mat_lst, a_mat_lst, b_mat_lst, avg_psi_mat_lst, args.eta)
                utils.update_a_b(a_mat_lst, b_mat_lst, e_a_mat_lst, e_b_mat_lst, device, args.use_gsvd)
                utils.zero_matrix(avg_psi_mat_lst, e_a_mat_lst, e_b_mat_lst)
            model.eval()
            with torch.no_grad():
                correct = 0
                for data, target in test_loader[task]:
                    data, target = data.to(device), target.to(device)
                    data = data.view(-1, 784)
                    for mc_iter in range(args.train_mc_iters):
                        phi_mat_lst = utils.gen_phi(w_mat_lst, device)
                        utils.randomize_weights(weight_lst, w_mat_lst, m_mat_lst,
                                                a_mat_lst, b_mat_lst, phi_mat_lst)
                        output = model(data)
                        # index of the max log-probability
                        pred = output.argmax(dim=1, keepdim=True)
                        correct += pred.eq(target.view_as(pred)).sum().item()
            test_acc = 100. * correct / (len(test_loader[task].dataset) * args.train_mc_iters)
            print('\nTask num {}, Epoch num {} Test Accuracy: {:.2f}%\n'.format(task, epoch, test_acc))
        test_acc_lst = []
        for i in range(task + 1):
            model.eval()
            with torch.no_grad():
                correct = 0
                for data, target in test_loader[i]:
                    data, target = data.to(device), target.to(device)
                    data = data.view(-1, 784)
                    for mc_iter in range(args.train_mc_iters):
                        phi_mat_lst = utils.gen_phi(w_mat_lst, device)
                        utils.randomize_weights(weight_lst, w_mat_lst, m_mat_lst,
                                                a_mat_lst, b_mat_lst, phi_mat_lst)
                        output = model(data)
                        # index of the max log-probability
                        pred = output.argmax(dim=1, keepdim=True)
                        correct += pred.eq(target.view_as(pred)).sum().item()
            test_acc = 100. * correct / (len(test_loader[i].dataset) * args.train_mc_iters)
            test_acc_lst.append(test_acc)
            print('\nTraining task num: {} Test Accuracy of task {}: {:.2f}%\n'.format(task, i, test_acc))
        print(test_acc_lst)
        ava_test.append(np.average(np.asanyarray(test_acc_lst)))
    return ava_test
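
# Note (hedged): with Phi ~ MN(0, I, I), the reparameterization W = M + B Phi A^T
# samples W ~ MN(M, B B^T, A A^T), so B B^T and A A^T act as the row and column
# covariance factors of the matrix-variate weight posterior updated above.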