def __init__(self, num_class, num_segments, modality, base_model='resnet101', new_length=None, consensus_type='avg', before_softmax=True, dropout=0.8, crop_num=1, partial_bn=True): super(TSN, self).__init__() self.modality = modality self.num_segments = num_segments self.reshape = True self.before_softmax = before_softmax self.dropout = dropout self.crop_num = crop_num self.consensus_type = consensus_type if not before_softmax and consensus_type != 'avg': raise ValueError("Only avg consensus can be used after Softmax") if new_length is None: self.new_length = 1 if modality == "RGB" else 5 else: self.new_length = new_length log((""" Initializing TSN with base model: {}. TSN Configurations: input_modality: {} num_segments: {} new_length: {} consensus_module: {} dropout_ratio: {} """.format(base_model, self.modality, self.num_segments, self.new_length, consensus_type, self.dropout))) self._prepare_base_model(base_model) feature_dim = self._prepare_tsn(num_class) if self.modality == 'Flow': log("Converting the ImageNet model to a flow init model") self.base_model = self._construct_flow_model(self.base_model) log("Done. Flow model ready...") elif self.modality == 'RGBDiff': log("Converting the ImageNet model to RGB+Diff init model") self.base_model = self._construct_diff_model(self.base_model) log("Done. RGBDiff model ready.") self.consensus = ConsensusModule(consensus_type) if not self.before_softmax: self.softmax = nn.Softmax() self._enable_pbn = partial_bn if partial_bn: self.partialBN(True)
def validate(val_loader, model, criterion, iter, logger=None): batch_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() # switch to evaluate mode model.eval() end = time.time() for i, (input, target) in enumerate(val_loader): target = target.cuda(async=True) input_var = torch.autograd.Variable(input, volatile=True) target_var = torch.autograd.Variable(target, volatile=True) # compute output output = model(input_var) loss = criterion(output, target_var) # measure accuracy and record loss prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) losses.update(loss.data[0], input.size(0)) top1.update(prec1[0], input.size(0)) top5.update(prec5[0], input.size(0)) # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.log_freq == 0: log(('Test: [{0}/{1}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( i, len(val_loader), batch_time=batch_time, loss=losses, top1=top1, top5=top5))) log(( 'Testing Results: Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f} Loss {loss.avg:.5f}' .format(top1=top1, top5=top5, loss=losses))) return top1.avg
def train(self, mode=True): """ Override the default train() to freeze the BN parameters 通过重写train方法实现PBN :return: """ super(TSN, self).train(mode) count = 0 if self._enable_pbn: log("Freezing BatchNorm2D except the first one.") for m in self.base_model.modules(): if isinstance(m, nn.BatchNorm2d): count += 1 if count >= (2 if self._enable_pbn else 1): m.eval() # shutdown update in frozen mode m.weight.requires_grad = False m.bias.requires_grad = False
def _load_image(self, directory, record, idx): if self.modality == 'RGB' or self.modality == 'RGBDiff': directory += record.path return [ Image.open(os.path.join( directory, self.image_tmpl.format(idx))).convert('RGB') ] elif self.modality == 'Flow': log('directory:', directory) log('idx:', idx) #每一次返回两张光流图分为x,y方向各一张。 x_img = Image.open( os.path.join(directory + 'u/' + record.path, self.image_tmpl.format(idx))).convert('L') y_img = Image.open( os.path.join(directory + 'v/' + record.path, self.image_tmpl.format(idx))).convert('L') return [x_img, y_img]
def blank_loglikes(n): a = ones((NUM_CHARS, n)) * 0.1 a[0, :] = 0.9 a /= sqrt(square(a).sum(axis=0)) return log(a)
def uniform_loglikes(n): return log(ones((NUM_CHARS, n)) / float(NUM_CHARS))
def cost_and_grad(self, data, labels, back=True, prev_h0=None): hps = self.hps T = data.shape[1] bsize = data.shape[2] # FIXME gnumpy reallocates if try and use same parameters? #us = self.us[:, 0:T, 0:bsize] #dus = self.dus[:, 0:T, 0:bsize] #hs = self.hs[:, 0:T, 0:bsize] #dhs = self.dhs[:, 0:T, 0:bsize] #probs = self.probs[:, 0:T, 0:bsize] #dprobs = self.dprobs[:, 0:T, 0:bsize] #costs = self.costs[0:T, 0:bsize] us = list() dus = list() hs = list() dhs = list() h0 = list() for k in xrange(hps.hidden_layers): us.append(list()) dus.append(list()) hs.append(list()) dhs.append(list()) h0.append(empty((hps.hidden_size, bsize))) for t in xrange(T): us[k].append(zeros((hps.hidden_size, bsize))) dus[k].append(zeros((hps.hidden_size, bsize))) hs[k].append(zeros((hps.hidden_size, bsize))) dhs[k].append(zeros((hps.hidden_size, bsize))) probs = list() for t in xrange(T): probs.append(zeros((hps.output_size, bsize))) costs = np.zeros((T, bsize)) if prev_h0 is not None: h0 = prev_h0 else: for k in xrange(hps.hidden_layers): h0[k] = tile(self.params['h0'][:, k].reshape(-1, 1), bsize) bih = self.params['bih'] Wih = self.params['Wih'] Whh = self.params['Whh'] bhh = self.params['bhh'] Who = self.params['Who'] bho = self.params['bho'] # Forward prop for t in xrange(T): for k in xrange(hps.hidden_layers): if t == 0: hprev = h0[k] else: hprev = hs[k][t-1] if k == 0: us[k][t] = mult(Wih, data[:, t, :]) + bih else: us[k][t] = mult(self.params['Wh%d' % k], hs[k-1][t]) if k == hps.recurrent_layer - 1: us[k][t] += mult(Whh, hprev) + bhh # Clip maximum activation mask = us[k][t] < hps.max_act us[k][t] = us[k][t] * mask + hps.max_act * (1 - mask) elif k != 0: us[k][t] += self.params['bh%d' % k] hs[k][t] = self.nl(us[k][t]) probs[t] = softmax(mult(Who, hs[-1][t]) + bho) self.last_h = list() for k in xrange(hps.hidden_layers): self.last_h.append(hs[k][-1]) if labels is None: return None, probs probs_neg_log = list() dprobs = list() for t in xrange(T): probs_neg_log.append(as_np(-1 * log(probs[t]))) dprobs.append(as_np(probs[t].copy())) for k in xrange(bsize): for t in xrange(len(labels[k])): costs[t, k] = probs_neg_log[t][labels[k][t], k] dprobs[t][labels[k][t], k] -= 1 for t in xrange(T): dprobs[t] = array(dprobs[t]) # NOTE Summing costs over time # NOTE FIXME Dividing by T to get better sense if objective # is decreasing, remove for grad checking cost = costs.sum() / bsize / float(T) if not back: return cost, probs # Backprop for k in self.grads: self.grads[k][:] = 0 for t in reversed(xrange(T)): self.grads['bho'] += dprobs[t][:, :].sum(axis=-1).reshape((-1, 1)) / bsize self.grads['Who'] += mult(dprobs[t], hs[-1][t].T) / bsize for k in reversed(xrange(hps.hidden_layers)): if k == hps.hidden_layers - 1: dhs[k][t] += mult(Who.T, dprobs[t]) else: dhs[k][t] += mult(self.params['Wh%d' % (k+1)].T, dhs[k+1][t]) dus[k][t] += get_nl_grad(self.hps.nl, us[k][t]) * dhs[k][t] if k > 0: self.grads['Wh%d' % k] += mult(dus[k][t], hs[k-1][t].T) / bsize self.grads['bh%d' % k] += dus[k][t].sum(axis=-1).reshape((-1, 1)) / bsize if k == hps.recurrent_layer - 1: if t == 0: hprev = h0[k] self.grads['h0'][:, k] = mult(Whh.T, dus[k][t]).sum(axis=-1) / bsize else: hprev = hs[k][t-1] dhs[k][t-1] = mult(Whh.T, dus[k][t]) self.grads['Whh'] += mult(dus[k][t], hprev.T) / bsize self.grads['bhh'] += dus[k][t].sum(axis=-1).reshape((-1, 1)) / bsize self.grads['Wih'] += mult(dus[0][t], data[:, t, :].T) / bsize self.grads['bih'] += dus[0][t].sum(axis=-1).reshape((-1, 1)) / bsize return cost, self.grads
def cost_and_grad(self, data, labels, back=True, prev_h0=None): hps = self.hps T = data.shape[1] bsize = data.shape[2] # FIXME gnumpy reallocates if try and use same parameters? #us = self.us[:, 0:T, 0:bsize] #dus = self.dus[:, 0:T, 0:bsize] #hs = self.hs[:, 0:T, 0:bsize] #dhs = self.dhs[:, 0:T, 0:bsize] #probs = self.probs[:, 0:T, 0:bsize] #dprobs = self.dprobs[:, 0:T, 0:bsize] #costs = self.costs[0:T, 0:bsize] us = list() dus = list() hs = list() dhs = list() h0 = list() for k in xrange(hps.hidden_layers): us.append(list()) dus.append(list()) hs.append(list()) dhs.append(list()) h0.append(empty((hps.hidden_size, bsize))) for t in xrange(T): us[k].append(zeros((hps.hidden_size, bsize))) dus[k].append(zeros((hps.hidden_size, bsize))) hs[k].append(zeros((hps.hidden_size, bsize))) dhs[k].append(zeros((hps.hidden_size, bsize))) probs = list() for t in xrange(T): probs.append(zeros((hps.output_size, bsize))) costs = np.zeros((T, bsize)) if prev_h0 is not None: h0 = prev_h0 else: for k in xrange(hps.hidden_layers): h0[k] = tile(self.params['h0'][:, k].reshape(-1, 1), bsize) bih = self.params['bih'] Wih = self.params['Wih'] Whh = self.params['Whh'] bhh = self.params['bhh'] Who = self.params['Who'] bho = self.params['bho'] # Forward prop for t in xrange(T): for k in xrange(hps.hidden_layers): if t == 0: hprev = h0[k] else: hprev = hs[k][t - 1] if k == 0: us[k][t] = mult(Wih, data[:, t, :]) + bih else: us[k][t] = mult(self.params['Wh%d' % k], hs[k - 1][t]) if k == hps.recurrent_layer - 1: us[k][t] += mult(Whh, hprev) + bhh # Clip maximum activation mask = us[k][t] < hps.max_act us[k][t] = us[k][t] * mask + hps.max_act * (1 - mask) elif k != 0: us[k][t] += self.params['bh%d' % k] hs[k][t] = self.nl(us[k][t]) probs[t] = softmax(mult(Who, hs[-1][t]) + bho) self.last_h = list() for k in xrange(hps.hidden_layers): self.last_h.append(hs[k][-1]) if labels is None: return None, probs probs_neg_log = list() dprobs = list() for t in xrange(T): probs_neg_log.append(as_np(-1 * log(probs[t]))) dprobs.append(as_np(probs[t].copy())) for k in xrange(bsize): for t in xrange(len(labels[k])): costs[t, k] = probs_neg_log[t][labels[k][t], k] dprobs[t][labels[k][t], k] -= 1 for t in xrange(T): dprobs[t] = array(dprobs[t]) # NOTE Summing costs over time # NOTE FIXME Dividing by T to get better sense if objective # is decreasing, remove for grad checking cost = costs.sum() / bsize / float(T) if not back: return cost, probs # Backprop for k in self.grads: self.grads[k][:] = 0 for t in reversed(xrange(T)): self.grads['bho'] += dprobs[t][:, :].sum(axis=-1).reshape( (-1, 1)) / bsize self.grads['Who'] += mult(dprobs[t], hs[-1][t].T) / bsize for k in reversed(xrange(hps.hidden_layers)): if k == hps.hidden_layers - 1: dhs[k][t] += mult(Who.T, dprobs[t]) else: dhs[k][t] += mult(self.params['Wh%d' % (k + 1)].T, dhs[k + 1][t]) dus[k][t] += get_nl_grad(self.hps.nl, us[k][t]) * dhs[k][t] if k > 0: self.grads['Wh%d' % k] += mult(dus[k][t], hs[k - 1][t].T) / bsize self.grads['bh%d' % k] += dus[k][t].sum(axis=-1).reshape( (-1, 1)) / bsize if k == hps.recurrent_layer - 1: if t == 0: hprev = h0[k] self.grads['h0'][:, k] = mult( Whh.T, dus[k][t]).sum(axis=-1) / bsize else: hprev = hs[k][t - 1] dhs[k][t - 1] = mult(Whh.T, dus[k][t]) self.grads['Whh'] += mult(dus[k][t], hprev.T) / bsize self.grads['bhh'] += dus[k][t].sum(axis=-1).reshape( (-1, 1)) / bsize self.grads['Wih'] += mult(dus[0][t], data[:, t, :].T) / bsize self.grads['bih'] += dus[0][t].sum(axis=-1).reshape( (-1, 1)) / bsize return cost, self.grads
elif args.dataset == 'hmdb51': num_class = 51 elif args.dataset == 'kinetics': num_class = 400 else: raise ValueError('Unknown dataset ' + args.dataset) net = TSN(num_class, 1, args.modality, base_model=args.arch, consensus_type=args.crop_fusion_type, dropout=args.dropout) checkpoint = torch.load(args.weights) log("model epoch {} best prec@1: {}".format(checkpoint['epoch'], checkpoint['best_prec1'])) base_dict = { '.'.join(k.split('.')[1:]): v for k, v in list(checkpoint['state_dict'].items()) } net.load_state_dict(base_dict) if args.test_crops == 1: cropping = torchvision.transforms.Compose([ GroupScale(net.scale_size), GroupCenterCrop(net.input_size), ]) elif args.test_crops == 10: cropping = torchvision.transforms.Compose( [GroupOverSample(net.input_size, net.scale_size)])
def main(): global args, best_prec1 args = parser.parse_args() if args.dataset == 'ucf101': num_class = 101 elif args.dataset == 'hmdb51': num_class = 51 elif args.dataset == 'kinetics': num_class = 400 else: raise ValueError('Unknown dataset ' + args.dataset) model = TSN(num_class, args.num_segments, args.modality, base_model=args.arch, consensus_type=args.consensus_type, dropout=args.dropout, partial_bn=not args.no_partialbn) #只返回网络结构 # state_dict = model_zoo.load_url(model_urls[args.arch],model_dir='./') # pretrained_dict = {} # for key,value in state_dict.items(): # pretrained_dict['base_model.'+key] = value # # model_dict = model.state_dict() # # #filter weight # pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} # # log(pretrained_dict.keys()) # # model_dict.update(pretrained_dict) # # model.load_state_dict(model_dict) #模型加载预训练权重 crop_size = model.crop_size scale_size = model.scale_size input_mean = model.input_mean input_std = model.input_std policies = model.get_optim_policies() train_augmentation = model.get_augmentation() model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda() #实现单机多卡的分布式训练。 if args.resume: if os.path.isfile(args.resume): log(("=> loading checkpoint '{}'".format(args.resume))) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) log(("=> loaded checkpoint '{}' (epoch {})".format( args.evaluate, checkpoint['epoch']))) else: log(("=> no checkpoint found at '{}'".format(args.resume))) cudnn.benchmark = True # Data loading code if args.modality != 'RGBDiff': normalize = GroupNormalize(input_mean, input_std) else: normalize = IdentityTransform() if args.modality == 'RGB': data_length = 1 #每一个segment采集的帧的数量。 elif args.modality in ['Flow', 'RGBDiff']: data_length = 5 train_loader = torch.utils.data.DataLoader( TSNDataSet( root_path="./jpegs_256/" if args.modality == 'RGB' else '../tvl1_flow/', list_file=args.train_list, num_segments=args.num_segments, #默认为3 new_length=data_length, modality=args.modality, # image_tmpl="frame{:06d}.jpg" if args.modality in ["RGB", "RGBDiff"] else args.flow_prefix+"{}_{:05d}.jpg", image_tmpl="frame{:06}.jpg", transform=torchvision.transforms.Compose([ train_augmentation, Stack(roll=args.arch == 'BNInception'), ToTorchFormatTensor(div=args.arch != 'BNInception'), normalize, ])), batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) val_loader = torch.utils.data.DataLoader( TSNDataSet( root_path="./jpegs_256/" if args.modality == 'RGB' else '../tvl1_flow/', list_file=args.val_list, num_segments=args.num_segments, new_length=data_length, modality=args.modality, # image_tmpl="frame{:06d}.jpg" if args.modality in ["RGB", "RGBDiff"] else args.flow_prefix+"{}_{:05d}.jpg", image_tmpl="frame{:06}.jpg", random_shift=False, transform=torchvision.transforms.Compose([ GroupScale(int(scale_size)), GroupCenterCrop(crop_size), Stack(roll=args.arch == 'BNInception'), ToTorchFormatTensor(div=args.arch != 'BNInception'), normalize, ])), batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) # define loss function (criterion) and optimizer if args.loss_type == 'nll': criterion = torch.nn.CrossEntropyLoss().cuda() else: raise ValueError("Unknown loss type") for group in policies: log(('group: {} has {} params, lr_mult: {}, decay_mult: {}'.format( group['name'], len(group['params']), group['lr_mult'], group['decay_mult']))) optimizer = torch.optim.SGD(policies, args.lr, momentum=args.momentum, weight_decay=args.weight_decay) if args.evaluate: validate(val_loader, model, criterion, 0) return for epoch in range(args.start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch, args.lr_steps) # train for one epoch train(train_loader, model, criterion, optimizer, epoch) # break # evaluate on validation set if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1: prec1 = validate(val_loader, model, criterion, (epoch + 1) * len(train_loader)) # remember best prec@1 and save checkpoint is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, }, is_best)
def train(train_loader, model, criterion, optimizer, epoch): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() if args.no_partialbn: model.module.partialBN(False) else: model.module.partialBN(True) # switch to train mode model.train() #转换到train模式,此模式也可以自己重写自己定义哪些层需要训练。 end = time.time() for i, (input, target) in enumerate(train_loader): # measure data loading time #RGB 模态 # input 每一个sample应该是三张图片。 Input.size() ==> [16,9,224,224]具体请看model.py文件 # taeget.size() == > [16] #FLOW 模态 # Input 每一个sample应该是三个堆叠的光流图堆,Input.size() ==> [16,30,224,224] # taeget.size() ==> 16 batch_size=16,一个batch0总共有16段视频。故总共16个target. # log('input:',input.size()) # log('target:',target) data_time.update(time.time() - end) target = target.cuda(async=True) input_var = torch.autograd.Variable(input) target_var = torch.autograd.Variable(target) # compute output output = model(input_var) # break loss = criterion(output, target_var) # measure accuracy and record loss prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) losses.update(loss.data[0], input.size(0)) top1.update(prec1[0], input.size(0)) top5.update(prec5[0], input.size(0)) # compute gradient and do SGD step optimizer.zero_grad() loss.backward() if args.clip_gradient is not None: total_norm = clip_grad_norm(model.parameters(), args.clip_gradient) if total_norm > args.clip_gradient: log("clipping gradient: {} with coef {}".format( total_norm, args.clip_gradient / total_norm)) optimizer.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.log_freq == 0: log(('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, top1=top1, top5=top5, lr=optimizer.param_groups[-1]['lr'])))