def build():
    # constants
    floatX = config.floatX
    enum = enumerate

    # load pre-trained parameters
    file = gzip.GzipFile("params.zip", 'rb')
    params = load(file)
    file.close()
    print params

    W = [[None, None], [None, None], [None, None]]
    b = [[None, None], [None, None], [None, None]]
    W[0][0], b[0][0], W[0][1], b[0][1], \
        W[1][0], b[1][0], W[1][1], b[1][1], \
        W[2][0], b[2][0], W[2][1], b[2][1], Wh, bh, Ws, bs = params

    # -----------------------------FLIP KERNEL------------------------------------------
    W = array(W)
    W_new = [[None, None], [None, None], [None, None]]
    for i in range(W.shape[0]):
        for j in range(W.shape[1]):
            w = W[i, j].get_value()
            print w.shape, w.dtype
            for k in range(w.shape[0]):
                for l in range(w.shape[1]):
                    for m in range(w.shape[2]):
                        w[k, l, m] = cv2.flip(w[k, l, m], -1)  # flipCode=-1: flip both axes
            W_new[i][j] = shared(array(w, dtype=floatX), borrow=True)
    W = W_new
    # -----------------------------FLIP KERNEL------------------------------------------

    rng = random.RandomState(1337)  # this will make sure results are always the same
    batch_size = 1
    in_shape = (1, 2, 2, 32, 64, 64)  # (batchsize, nets, maps, frames, w, h): input video shape
    traj_shape = (batch_size, 3, 32)  # (batchsize, 3, frames): input shape of the trajectory

    # hyper parameters
    # ------------------------------------------------------------------------------

    # use techniques/methods
    class use:
        drop = True        # dropout
        depth = True       # use depth map as input
        aug = False        # data augmentation
        load = False       # load params.p file
        traj = False       # trajectory
        trajconv = False   # convolutions on trajectory
        valid2 = False
        fast_conv = False
        norm_div = False
        norm = True        # normalization layer
        mom = True         # momentum

    # regularization
    class reg:
        L1_traj = .0  # degree/amount of regularization
        L2_traj = .0  # 1: only L1, 0: only L2
        L1_vid = .0   # degree/amount of regularization
        L2_vid = .0   # 1: only L1, 0: only L2

    class trajconv:
        append = False   # append convolution result to original trajectory
        filter_size = 5
        layers = 3       # number of convolution layers
        res_shape = traj_shape[-1] - layers * (filter_size - 1)

    class net:
        shared_stages = []    # stages where weights are shared
        shared_convnets = []  # convnets that share weights with neighbouring convnet
        n_convnets = 2        # number of convolutional networks in the architecture
        maps = [2, 16, 32, 64]  # feature maps in each convolutional network
        # maps = [2,5,25,25]    # feature maps in each convolutional network
        kernels = [(1, 7, 7), (1, 8, 8), (1, 6, 6)]  # convolution kernel shapes
        pools = [(2, 2, 2), (2, 2, 2), (2, 2, 2)]    # pool/subsampling shapes
        hidden_traj = 200  # hidden units in MLP
        hidden_vid = 300   # hidden units in MLP
        W_scale = 0.01
        b_scale = 0.1
        norm_method = "lcn"  # normalisation method: lcn = local contrast normalisation
        pool_method = "max"  # maxpool
        fusion = "early"     # early or late fusion
        hidden = hidden_traj + hidden_vid if fusion == "late" else 500  # hidden units in MLP
        n_class = 21

    activation = relu
    n_stages = len(net.kernels)
    video_shapes = [in_shape[-3:]]

    def _shared(val, borrow=True):
        return shared(array(val, dtype=floatX), borrow=borrow)

    def ndtensor(n):
        return TensorType(floatX, (False, ) * n)  # n-dimensional tensor

    # calculate resulting video shapes for all stages
    for i in xrange(n_stages):
        k, p, v = array(net.kernels[i]), array(net.pools[i]), array(video_shapes[i])
        conv_s = tuple(v - k + 1)
        video_shapes.append(tuple((v - k + 1) / p))
    n_in_MLP = net.maps[-1] * net.n_convnets * prod(video_shapes[-1])

    def conv_args(stage, i):
        """ConvLayer arguments. stage: stage index, i: convnet index"""
        args = {
            'batch_size': 1,
            'activation': activation,
            'rng': rng,
            'n_in_maps': net.maps[stage],
            'n_out_maps': net.maps[stage + 1],
            'kernel_shape': net.kernels[stage],
            'video_shape': video_shapes[stage],
            "fast_conv": use.fast_conv,
            "layer_name": "Conv" + str(stage),
            "W_scale": net.W_scale,
            "b_scale": net.b_scale,
            "stride": 1,
            "W": W[stage][i],
            "b": b[stage][i]
        }
        return args
    # print conv_args(0,0)

    x = ndtensor(len(in_shape))(name='x')  # video input

    def var_norm(_x, imgs=True, axis=[-3, -2, -1]):
        # subtract mean and divide by std (per block if imgs=True)
        if imgs:
            return (_x - T.mean(_x, axis=axis, keepdims=True)) / T.maximum(
                1e-4, T.std(_x, axis=axis, keepdims=True))
        return (_x - T.mean(_x)) / T.maximum(1e-4, T.std(_x))

    def std_norm(_x, axis=[-3, -2, -1]):
        # divide by std only
        return _x / T.maximum(1e-4, T.std(_x, axis=axis, keepdims=True))

    out = [x[:, 0], x[:, 1]]

    # build 3D ConvNet
    for stage in xrange(n_stages):
        for i in xrange(len(out)):  # for each convnet of the stage
            if stage == 0:
                # first stage: LCN on the gray channel, variance norm on depth
                gray_norm = NormLayer(out[i][:, 0:1], method="lcn",
                                      use_divisor=False).output
                gray_norm = std_norm(gray_norm)
                depth_norm = var_norm(out[i][:, 1:])
                out[i] = T.concatenate([gray_norm, depth_norm], axis=1)
            else:
                out[i] = NormLayer(out[i], method="lcn", use_divisor=False).output
                out[i] = std_norm(out[i])
            out[i] = ConvLayer(out[i], **conv_args(stage, i)).output
            out[i] = PoolLayer(out[i], net.pools[stage],
                               method=net.pool_method).output

    # flatten all convnets outputs
    out = [out[i].flatten(2) for i in range(len(out))]
    out = T.concatenate(out, axis=1)

    # hidden layer
    out = HiddenLayer(out, W=Wh, b=bh, n_in=n_in_MLP, n_out=net.hidden,
                      rng=rng, activation=activation).output

    # softmax layer
    logreg = LogRegr(out, W=Ws, b=bs, rng=rng, activation=activation,
                     n_in=net.hidden, n_out=net.n_class)
    pred = logreg.p_y_given_x

    x_ = _shared(empty(in_shape))

    print "compiling..."
    eval_model = function([], [pred], givens={x: x_}, on_unused_input='ignore')
    print "compiling done"

    return eval_model, x_
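# ------------------------------------------------------------------------------
# Note on the FLIP KERNEL block above: cv2.flip(w, -1) with flipCode=-1 reverses
# a 2D array along BOTH axes. The likely intent (an assumption, not stated in
# this file) is to convert kernels between the cross-correlation and true-
# convolution conventions when reusing parameters trained elsewhere. A minimal
# self-contained check of the flip itself (illustrative names only):
import numpy as np
import cv2

w2d = np.arange(9, dtype=np.float32).reshape(3, 3)
assert np.array_equal(cv2.flip(w2d, -1), w2d[::-1, ::-1])  # rows and cols reversed
# ------------------------------------------------------------------------------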
for stage in xrange(n_stages):
    for i in xrange(len(out)):  # for each convnet of the stage
        # normalization
        # if use.norm and stage==0:
        #     gray_norm = NormLayer(out[i][:,0:1], method="lcn",
        #                           use_divisor=use.norm_div).output
        #     gray_norm = std_norm(gray_norm,axis=[-3,-2,-1])
        #     depth_norm = var_norm(out[i][:,1:])
        #     out[i] = T.concatenate([gray_norm,depth_norm],axis=1)
        # elif use.norm:
        #     out[i] = NormLayer(out[i], method="lcn",use_divisor=use.norm_div).output
        #     # out[i] = std_norm(out[i],axis=[-3,-2,-1])
        # else:
        if stage == 0:
            gray_norm = NormLayer(out[i][:, 0:1], method="lcn", kernel_size=9,
                                  use_divisor=use.norm_div).output
            # gray_norm = std_norm(gray_norm,axis=[-3,-2,-1])
            depth_norm = var_norm(out[i][:, 1:2])
            # user_norm = var_norm(out[i][:,2:3])
            out[i] = T.concatenate([gray_norm, depth_norm], axis=1)
            # out[i] = T.concatenate([gray_norm,depth_norm, user_norm],axis=1)
            # out[i] = mean_sub(out[i],axis=[-3,-2,-1])
            # out[i] = std_norm(out[i],axis=[-4,-3,-2,-1])
            # out[i] = var_norm(out[i],axis=[-3,-2,-1])
        elif stage == 1:
            out[i] = NormLayer(out[i], method="lcn",
                               use_divisor=use.norm_div).output
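# ------------------------------------------------------------------------------
# Rough numpy sketch of what NormLayer(method="lcn") is commonly understood to
# compute (an assumption -- the repo's exact layer may differ): local contrast
# normalisation subtracts a Gaussian-weighted local mean and, when use_divisor
# is enabled, also divides by the local standard deviation. kernel_size=9 above
# would bound the spatial extent of the smoothing kernel.
import numpy as np
from scipy.ndimage import gaussian_filter

def lcn_sketch(img, sigma=2.0, use_divisor=False, eps=1e-4):
    centered = img - gaussian_filter(img, sigma)        # subtractive step
    if not use_divisor:
        return centered
    local_std = np.sqrt(gaussian_filter(centered ** 2, sigma))
    return centered / np.maximum(local_std, eps)        # divisive step
# ------------------------------------------------------------------------------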
else:
    if net.n_convnets == 1:
        out = [x[:, 0, 0:1]]
    else:
        out = [x[:, 0, 0:1], x[:, 1, 0:1]]  # 2 nets without depth: left and right

    # build 3D ConvNet
    for stage in xrange(n_stages):
        for i in xrange(len(out)):  # for each convnet of the stage
            # convolutions
            layers.append(ConvLayer(out[i], **conv_args(stage, i)))
            out[i] = layers[-1].output

            # normalization
            if use.norm:
                out[i] = NormLayer(out[i], method=net.norm_method,
                                   use_divisor=use.norm_div).output

            # pooling, subsampling
            out[i] = PoolLayer(out[i], net.pools[stage],
                               method=net.pool_method).output
            # out[i] = normalize(out[i])

# flatten all convnets outputs
out = [out[i].flatten(2) for i in range(len(out))]
vid_ = T.concatenate(out, axis=1)
debug = vid_

# trajectory convolution
# ------------------------------------------------------------------------------
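# ------------------------------------------------------------------------------
# Shape bookkeeping for the flatten/concatenate fusion above: flatten(2) keeps
# the batch axis and collapses everything else, then the branches are stacked
# feature-wise. numpy sketch with the shapes this architecture produces
# (placeholder values, illustrative names only):
import numpy as np

a = np.zeros((4, 64, 4, 3, 3))  # branch 0 output: (batch, maps, frames, h, w)
b = np.zeros((4, 64, 4, 3, 3))  # branch 1 output
flat = [t.reshape(t.shape[0], -1) for t in (a, b)]  # like Theano's flatten(2)
fused = np.concatenate(flat, axis=1)
print fused.shape  # (4, 4608) = (batch, n_convnets * maps * frames * h * w)
# ------------------------------------------------------------------------------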
def build():
    use.load = True  # we load the CNN parameters here
    x = ndtensor(len(tr.in_shape))(name='x')  # video input
    x_ = _shared(empty(tr.in_shape))

    # calculate resulting video shapes for all stages
    conv_shapes = []
    for i in xrange(net.n_stages):
        k, p, v = array(net.kernels[i]), array(net.pools[i]), array(tr.video_shapes[i])
        conv_s = tuple(v - k + 1)
        conv_shapes.append(conv_s)
        tr.video_shapes.append(tuple((v - k + 1) / p))
        print "stage", i
        print " conv", tr.video_shapes[i], "->", conv_s
        print " pool", conv_s, "->", tr.video_shapes[i + 1], "x", net.maps[i + 1]

    # number of inputs for MLP = (# maps last stage)*(# convnets)*(resulting video shape) + trajectory size
    n_in_MLP = net.maps[-1] * net.n_convnets * prod(tr.video_shapes[-1])
    print 'MLP:', n_in_MLP, "->", net.hidden, "->", net.n_class, ""

    if use.depth:
        if net.n_convnets == 2:
            out = [x[:, :, 0, :, :, :], x[:, :, 1, :, :, :]]  # 2 nets: body and hand

        # build 3D ConvNet
        layers = []  # all architecture layers
        insp = []
        for stage in xrange(net.n_stages):
            for i in xrange(len(out)):  # for body and hand
                # normalization
                if use.norm and stage == 0:
                    gray_norm = NormLayer(out[i][:, 0:1], method="lcn",
                                          use_divisor=use.norm_div).output
                    gray_norm = std_norm(gray_norm, axis=[-3, -2, -1])
                    depth_norm = var_norm(out[i][:, 1:])
                    out[i] = T.concatenate([gray_norm, depth_norm], axis=1)
                elif use.norm:
                    out[i] = NormLayer(out[i], method="lcn",
                                       use_divisor=use.norm_div).output
                    out[i] = std_norm(out[i], axis=[-3, -2, -1])

                # convolutions
                out[i] *= net.scaler[stage][i]
                layers.append(
                    ConvLayer(out[i], **conv_args(stage, i, batch, net, use,
                                                  tr.rng, tr.video_shapes)))
                out[i] = layers[-1].output
                out[i] = PoolLayer(out[i], net.pools[stage],
                                   method=net.pool_method).output
                if tr.inspect:
                    insp.append(T.mean(out[i]))

    # flatten all convnets outputs
    for i in xrange(len(out)):
        out[i] = std_norm(out[i], axis=[-3, -2, -1])
    out = [out[i].flatten(2) for i in range(len(out))]
    vid_ = T.concatenate(out, axis=1)

    # dropout
    if use.drop:
        drop.p_vid = shared(float32(drop.p_vid_val))
        drop.p_hidden = shared(float32(drop.p_hidden_val))
        drop.p_vid.set_value(float32(0.))     # don't use dropout when testing
        drop.p_hidden.set_value(float32(0.))  # don't use dropout when testing
        vid_ = DropoutLayer(vid_, rng=tr.rng, p=drop.p_vid).output

    # MLP
    # ------------------------------------------------------------------------------
    # fusion
    if net.fusion == "early":
        out = vid_

    # hidden layer
    Wh, bh = load_params(use)  # test-time load, added by wudi
    layers.append(
        HiddenLayer(out, W=Wh, b=bh, n_in=n_in_MLP, n_out=net.hidden,
                    rng=tr.rng, W_scale=net.W_scale[-2],
                    b_scale=net.b_scale[-2], activation=relu))
    out = layers[-1].output

    if tr.inspect:
        insp = T.stack(insp[0], insp[1], insp[2], insp[3], insp[4], insp[5],
                       T.mean(out))
    else:
        insp = T.stack(0, 0)

    if use.drop:
        out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output
    # maxout

    # softmax layer
    Ws, bs = load_params(use)  # test-time load, added by wudi
    layers.append(
        LogRegr(out, W=Ws, b=bs, rng=tr.rng, activation=lin, n_in=net.hidden,
                W_scale=net.W_scale[-1], b_scale=net.b_scale[-1],
                n_out=net.n_class))
    """
    layers[-1] : softmax layer
    layers[-2] : hidden layer (video if late fusion)
    layers[-3] : hidden layer (trajectory, only if late fusion)
    """
    # prediction
    y_pred = layers[-1].y_pred
    p_y_given_x = layers[-1].p_y_given_x

    ####################################################################
    ####################################################################
    print "\n%s\n\tcompiling\n%s" % (('-' * 30, ) * 2)
    ####################################################################
    ####################################################################

    # compile functions
    # ------------------------------------------------------------------------------
    print 'compiling test_model'
    eval_model = function([], [y_pred, p_y_given_x], givens={x: x_},
                          on_unused_input='ignore')
    return eval_model, x_
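# ------------------------------------------------------------------------------
# Worked example of the shape propagation printed by build() above: each stage
# applies a valid 3D convolution (out = in - kernel + 1) followed by pooling
# (integer division by the pool shape). Using the kernels/pools defined earlier:
import numpy as np

kernels = [(1, 7, 7), (1, 8, 8), (1, 6, 6)]
pools = [(2, 2, 2)] * 3
shape = np.array((32, 64, 64))  # (frames, h, w)
for k, p in zip(kernels, pools):
    conv_s = shape - np.array(k) + 1  # valid convolution
    shape = conv_s // np.array(p)     # pooling/subsampling
    print tuple(conv_s), "->", tuple(shape)
# (32, 58, 58) -> (16, 29, 29)
# (16, 22, 22) -> (8, 11, 11)
# (8, 6, 6)    -> (4, 3, 3)
# n_in_MLP = 64 maps * 2 convnets * (4*3*3) = 4608
# ------------------------------------------------------------------------------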
def __init__(self, x, use, lr, batch, net, reg, drop, mom, tr, res_dir,
             load_path=""):
    self.out = []
    self.layers = []
    self.insp_mean = []
    self.insp_std = []

    # log all hyper-parameter classes to the result directory
    for c in (use, lr, batch, net, reg, drop, mom, tr):
        write(c.__name__ + ":", res_dir)
        _s = c.__dict__
        del _s['__module__'], _s['__doc__']
        for key in _s.keys():
            val = str(_s[key])
            if val.startswith("<static"):
                val = str(_s[key].__func__.__name__)
            if val.startswith("<Cuda"):
                continue
            if val.startswith("<Tensor"):
                continue
            write(" " + key + ": " + val, res_dir)

    ####################################################################
    ####################################################################
    print "\n%s\n\tbuilding\n%s" % (('-' * 30, ) * 2)
    ####################################################################
    ####################################################################

    # ConvNet
    # ------------------------------------------------------------------------------
    # calculate resulting video shapes for all stages
    print net.n_stages
    conv_shapes = []
    for i in xrange(net.n_stages):
        k, p, v = array(net.kernels[i]), array(net.pools[i]), array(tr.video_shapes[i])
        conv_s = tuple(v - k + 1)
        conv_shapes.append(conv_s)
        tr.video_shapes.append(tuple((v - k + 1) / p))
        print "stage", i
        if use.depth and i == 0:
            print " conv", tr.video_shapes[i], "x 2 ->", conv_s  # for body and hand
        else:
            print " conv", tr.video_shapes[i], "->", conv_s
        print " pool", conv_s, "->", tr.video_shapes[i + 1], "x", net.maps[i + 1]

    # number of inputs for MLP = (# maps last stage)*(# convnets)*(resulting video shape) + trajectory size
    n_in_MLP = net.maps[-1] * net.n_convnets * prod(tr.video_shapes[-1])
    print 'debug1'

    if use.depth:
        if net.n_convnets == 2:
            out = [x[:, :, 0, :, :, :], x[:, :, 1, :, :, :]]  # 2 nets: body and hand

        # build 3D ConvNet
        for stage in xrange(net.n_stages):
            for i in xrange(len(out)):  # for body and hand
                # normalization
                if use.norm and stage == 0:
                    gray_norm = NormLayer(out[i][:, 0:1], method="lcn",
                                          use_divisor=use.norm_div).output
                    gray_norm = std_norm(gray_norm, axis=[-3, -2, -1])
                    depth_norm = var_norm(out[i][:, 1:])
                    out[i] = T.concatenate([gray_norm, depth_norm], axis=1)
                elif use.norm:
                    out[i] = NormLayer(out[i], method="lcn",
                                       use_divisor=use.norm_div).output
                    out[i] = std_norm(out[i], axis=[-3, -2, -1])

                # convolutions
                out[i] *= net.scaler[stage][i]
                print 'debug2'
                self.layers.append(
                    ConvLayer(out[i], **conv_args(stage, i, batch, net, use,
                                                  tr.rng, tr.video_shapes,
                                                  load_path)))
                out[i] = self.layers[-1].output
                out[i] = PoolLayer(out[i], net.pools[stage],
                                   method=net.pool_method).output
                if tr.inspect:
                    self.insp_mean.append(T.mean(out[i]))
                    self.insp_std.append(T.std(out[i]))
                print 'debug2'

    # flatten all convnets outputs
    for i in xrange(len(out)):
        out[i] = std_norm(out[i], axis=[-3, -2, -1])
    out = [out[i].flatten(2) for i in range(len(out))]
    vid_ = T.concatenate(out, axis=1)
    print 'debug3'

    # dropout
    if use.drop:
        drop.p_vid = shared(float32(drop.p_vid_val))
        drop.p_hidden = shared(float32(drop.p_hidden_val))
        vid_ = DropoutLayer(vid_, rng=tr.rng, p=drop.p_vid).output

    # maxout
    if use.maxout:
        vid_ = maxout(vid_, (batch.micro, n_in_MLP))
        net.activation = lin
        n_in_MLP /= 2
        # net.hidden *= 2

    # MLP
    # ------------------------------------------------------------------------------
    # fusion
    if net.fusion == "early":
        out = vid_

    # hidden layer
    if use.load:
        W, b = load_params(use, load_path)
        self.layers.append(
            HiddenLayer(out, n_in=n_in_MLP, n_out=net.hidden_vid, rng=tr.rng,
                        W=W, b=b, W_scale=net.W_scale[-2],
                        b_scale=net.b_scale[-2], activation=net.activation))
    else:
        self.layers.append(
            HiddenLayer(out, n_in=n_in_MLP, n_out=net.hidden_vid, rng=tr.rng,
                        W_scale=net.W_scale[-2], b_scale=net.b_scale[-2],
                        activation=net.activation))
    out = self.layers[-1].output

    # if tr.inspect:
    #     self.insp_mean = T.stack(self.insp_mean)
    #     self.insp_std = T.stack(self.insp_std)
    #     self.insp = T.stack(self.insp[0], self.insp[1], self.insp[2],
    #                         self.insp[3], self.insp[4], self.insp[5], T.mean(out))
    # else:
    #     self.insp = T.stack(0, 0)
    # out = normalize(out)

    if use.drop:
        out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output

    # maxout
    if use.maxout:
        out = maxout(out, (batch.micro, net.hidden))
        net.hidden /= 2
    print 'debug3'

    # now assemble all the output
    self.out = out
    self.n_in_MLP = n_in_MLP
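# ------------------------------------------------------------------------------
# The dropout rates above live in theano shared variables, which is what lets
# the evaluation code elsewhere in this repo call drop.p_*.set_value(0.) to
# switch dropout off without recompiling. A minimal sketch of that pattern
# (assumed names; inverted dropout -- the repo's DropoutLayer may scale
# differently):
import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

p_drop = theano.shared(np.float32(0.5))
srng = MRG_RandomStreams(seed=1337)
inp = T.matrix('inp')
mask = T.cast(srng.uniform(size=inp.shape) >= p_drop, 'float32')
f = theano.function([inp], inp * mask / (1. - p_drop))

p_drop.set_value(np.float32(0.))  # test time: mask is all ones, output == input
# ------------------------------------------------------------------------------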
# if net.n_convnets==2:
#     out = [x[:,0], x[:,1]]  # 2 nets: left and right
# else:
#     out = [x[:,0,0:1], x[:,0,1:2], x[:,1,0:1], x[:,1,1:2]]  # 4 nets
# else:
#     out = [x[:,0,0:1], x[:,1,1:2]]  # 2 nets without depth: left and right
out = [x[:, 0]]

# build 3D ConvNet
for stage in xrange(n_stages):
    for i in xrange(len(out)):  # for each convnet of the stage
        # convolutions
        layers.append(ConvLayer(out[i], **conv_args(stage, i)))
        out[i] = layers[-1].output

        # normalization
        if use.norm:
            out[i] = NormLayer(out[i], method=net.norm_method,
                               use_divisor=False).output

        # pooling, subsampling
        out[i] = PoolLayer(out[i], net.pools[stage],
                           method=net.pool_method).output

# flatten all convnets outputs
# out = [out[i].flatten(2) for i in range(len(out))]
# vid_ = T.concatenate(out, axis=1)
vid_ = T.concatenate([out[0].flatten(2)], axis=1)

# trajectory convolution
# ------------------------------------------------------------------------------
if use.trajconv:
    t_conv = t.reshape((prod(traj_shape[:-1]), 1, 1, traj_shape[-1]))
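# ------------------------------------------------------------------------------
# The reshape above lays the trajectory out for 2D convolution: each of the
# batchsize * 3 coordinate sequences becomes one single-channel 1 x frames
# "image". numpy sketch with the traj_shape used in this file:
import numpy as np

traj_shape = (1, 3, 32)  # (batchsize, coords, frames)
t = np.zeros(traj_shape, dtype=np.float32)
t_conv = t.reshape((np.prod(traj_shape[:-1]), 1, 1, traj_shape[-1]))
print t_conv.shape  # (3, 1, 1, 32)
# ------------------------------------------------------------------------------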