# Assumed module-level imports (this function is excerpted from a larger file):
# a torch-compatible namespace bound to `torch` and its `nn.functional` as `F`.
def power_iteration(W, u_, update=True, eps=1e-12):
    """Run one power-iteration step per stored singular-vector estimate in u_.

    Returns the estimated singular values together with the left (us) and
    right (vs) singular vectors of W.
    """
    Wt = torch.Tensor(W).t()
    # Lists holding singular vectors and values
    us, vs, svs = [], [], []
    for i, u in enumerate(u_):
        # Run one step of the power iteration
        with torch.no_grad():
            v = torch.matmul(u, W)
            # Run Gram-Schmidt to subtract components of all other singular vectors
            v = F.normalize(gram_schmidt(v, vs), eps=eps)
            # Add to the list
            vs += [v]
            # Update the other singular vector
            u = torch.matmul(v, Wt)
            # Run Gram-Schmidt to subtract components of all other singular vectors
            u = F.normalize(gram_schmidt(u, us), eps=eps)
            # Add to the list
            us += [u]
            if update:
                # Write the updated estimate back into the stored buffer
                # (plain-PyTorch equivalent: u_[i][:] = u)
                torch.copy(u, u_[i])
        # Compute this singular value and add it to the list
        svs += [torch.squeeze(torch.matmul(torch.matmul(v, Wt), u.t()))]
    return svs, us, vs
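# The helpers below are a minimal sketch of the Gram-Schmidt step assumed by
# power_iteration above but not defined in this file; the names `proj` and
# `gram_schmidt` and their exact form follow the common BigGAN-style
# spectral-norm code and should be treated as assumptions.
def proj(x, y):
    # Projection of the row vector x onto the row vector y.
    return torch.matmul(y, x.t()) * y / torch.matmul(y, y.t())


def gram_schmidt(x, ys):
    # Subtract from x its component along every vector already collected in ys.
    for y in ys:
        x = x - proj(x, y)
    return x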
def matmul(self, y):
    # Tensor-method form of matmul: forwards to torch.matmul with self as the
    # left operand, so x.matmul(y) behaves like torch.matmul(x, y).
    return torch.matmul(self, y)
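# Minimal usage sketch for the method above (assumes it is bound onto a
# torch-compatible Tensor class, as the `self` parameter suggests); the helper
# name below is hypothetical and only illustrates the intended equivalence.
def _matmul_method_example():
    x = torch.randn(2, 3)
    y = torch.randn(3, 4)
    # x.matmul(y) and torch.matmul(x, y) produce the same (2, 4) result.
    assert torch.allclose(x.matmul(y), torch.matmul(x, y))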
# Consistency check: run the same seeded matmul + argmax under PyTorch and
# under paddorch/paddle, and print both results for comparison.
import torch

torch.manual_seed(0)
a = torch.randn(70839, 64)
b = torch.randn(64, 64, requires_grad=True)
print(torch.argmax(torch.matmul(a, b)))

import paddorch
import paddle

a2 = paddorch.Tensor(a.detach().cpu().numpy())
b2 = paddorch.Tensor(b.detach().cpu().numpy())
print(paddle.argmax(paddorch.matmul(a2, b2)))
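# Optional follow-up sketch: compare the two outputs numerically instead of by
# eye. It assumes the paddorch result exposes a .numpy() method (as paddle
# dygraph tensors do); the tolerances are illustrative only.
import numpy as np

np.testing.assert_allclose(
    torch.matmul(a, b).detach().cpu().numpy(),
    paddorch.matmul(a2, b2).numpy(),
    rtol=1e-4,
    atol=1e-4,
)
print("torch and paddorch matmul outputs agree within tolerance")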
def train_moco(epoch, train_loader, model, model_ema, contrast, criterion, optimizer, sw, opt):
    """One epoch of MoCo-style contrastive training.

    Relies on helpers defined elsewhere in the training script (AverageMeter,
    warmup_linear, moment_update, clip_grad_norm) and on module-level imports
    of time, psutil, dgl and the torch-compatible namespace; hedged sketches
    of the assumed helpers follow this function.
    """
    n_batch = train_loader.dataset.total // opt.batch_size

    # Debug switch: when True, reuse one fixed pair of batched graphs and skip
    # parameter updates so the loss can be checked for determinism.
    no_update_debug = False
    if no_update_debug:
        model.eval()
        contrast.eval()
    else:
        model.train()
    model_ema.eval()

    def set_bn_train(m):
        classname = m.__class__.__name__
        if classname.find("BatchNorm") != -1:
            m.train()

    if not no_update_debug:
        model_ema.apply(set_bn_train)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    loss_meter = AverageMeter()
    epoch_loss_meter = AverageMeter()
    prob_meter = AverageMeter()
    graph_size = AverageMeter()
    gnorm_meter = AverageMeter()
    max_num_nodes = 0
    max_num_edges = 0

    end = time.time()
    if no_update_debug:
        graph_q, graph_k = train_loader.dataset[0]
        graph_q2, graph_k2 = train_loader.dataset[1]
        graph_q, graph_k = dgl.batch([graph_q, graph_q2]), dgl.batch([graph_k, graph_k2])

    for idx, batch in enumerate(train_loader):
        data_time.update(time.time() - end)
        if not no_update_debug:
            graph_q, graph_k = batch
        bsz = graph_q.batch_size

        if opt.moco:
            # ===================Moco forward=====================
            feat_q = model(graph_q)
            with torch.no_grad():
                feat_k = model_ema(graph_k)
            out = contrast(feat_q, feat_k)
            prob = out[:, 0].mean()
        else:
            # ===================Negative sampling forward=====================
            feat_q = model(graph_q)
            feat_k = model(graph_k)
            out = torch.matmul(feat_k, feat_q.t()) / opt.nce_t
            prob = out[range(graph_q.batch_size), range(graph_q.batch_size)].mean()

        assert feat_q.shape == (graph_q.batch_size, opt.hidden_size)

        # ===================backward=====================
        optimizer.zero_grad()
        loss = criterion(out)
        if not no_update_debug:
            loss.backward()
        grad_norm = clip_grad_norm(model.parameters(), 0)

        # Linear warmup of the learning rate.
        global_step = epoch * n_batch + idx
        lr_this_step = opt.learning_rate * warmup_linear(
            global_step / (opt.epochs * n_batch), 0.1)
        if lr_this_step is not None:
            optimizer.set_lr(lr_this_step)
        optimizer.step()

        if no_update_debug:
            print(loss.item())

        # ===================meters=====================
        loss_meter.update(loss.item(), bsz)
        epoch_loss_meter.update(loss.item(), bsz)
        prob_meter.update(prob.item(), bsz)
        graph_size.update(
            (graph_q.number_of_nodes() + graph_k.number_of_nodes()) / 2.0 / bsz,
            2 * bsz,
        )
        gnorm_meter.update(grad_norm, 1)
        max_num_nodes = max(max_num_nodes, graph_q.number_of_nodes())
        max_num_edges = max(max_num_edges, graph_q.number_of_edges())

        if opt.moco:
            if not no_update_debug:
                moment_update(model, model_ema, opt.alpha)

        batch_time.update(time.time() - end)
        end = time.time()

        # print info
        if (idx + 1) % opt.print_freq == 0:
            mem = psutil.virtual_memory()
            print(
                "Train: [{0}][{1}/{2}]\t"
                "BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                "DT {data_time.val:.3f} ({data_time.avg:.3f})\t"
                "loss {loss.val:.3f} ({loss.avg:.3f})\t"
                "prob {prob.val:.3f} ({prob.avg:.3f})\t"
                "GS {graph_size.val:.3f} ({graph_size.avg:.3f})\t"
                "mem {mem:.3f}".format(
                    epoch,
                    idx + 1,
                    n_batch,
                    batch_time=batch_time,
                    data_time=data_time,
                    loss=loss_meter,
                    prob=prob_meter,
                    graph_size=graph_size,
                    mem=mem.used / 1024**3,
                )
            )

        # tensorboard logger
        if (idx + 1) % opt.tb_freq == 0:
            global_step = epoch * n_batch + idx
            sw.add_scalar("moco_loss", loss_meter.avg, global_step)
            sw.add_scalar("moco_prob", prob_meter.avg, global_step)
            sw.add_scalar("graph_size", graph_size.avg, global_step)
            sw.add_scalar("graph_size/max", max_num_nodes, global_step)
            sw.add_scalar("graph_size/max_edges", max_num_edges, global_step)
            sw.add_scalar("gnorm", gnorm_meter.avg, global_step)
            sw.add_scalar("learning_rate", optimizer.param_groups[0]["lr"], global_step)
            loss_meter.reset()
            prob_meter.reset()
            graph_size.reset()
            gnorm_meter.reset()
            max_num_nodes, max_num_edges = 0, 0

    return epoch_loss_meter.avg
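# The helpers below are hedged sketches of utilities train_moco relies on but
# which are not defined in this file (their real definitions live in the
# training script's util module); the exact names and semantics here are
# assumptions modeled on the usual MoCo / GCC-style implementations.
class AverageMeter(object):
    # Track the latest value and a running (weighted) average.
    def __init__(self):
        self.reset()

    def reset(self):
        self.val, self.sum, self.count, self.avg = 0.0, 0.0, 0, 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / max(self.count, 1)


def warmup_linear(x, warmup=0.002):
    # Triangular schedule: ramp up linearly until `warmup`, then decay to zero.
    if x < warmup:
        return x / warmup
    return max((x - 1.0) / (warmup - 1.0), 0.0)


def moment_update(model, model_ema, m):
    # Momentum (EMA) update of the key encoder: ema = m * ema + (1 - m) * model.
    for p, p_ema in zip(model.parameters(), model_ema.parameters()):
        p_ema.data.mul_(m).add_(p.detach().data, alpha=1 - m)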