Exemplo n.º 1
0
def build():

    # constants
    floatX = config.floatX
    enum = enumerate

    file = gzip.GzipFile("params.zip", 'rb')
    params = load(file)
    file.close()
    print params
    W = [[None, None], [None, None], [None, None]]
    b = [[None, None], [None, None], [None, None]]
    W[0][0], b[0][0], W[0][1], b[0][1], W[1][0], b[1][0], W[1][1], b[1][1], W[
        2][0], b[2][0], W[2][1], b[2][1], Wh, bh, Ws, bs = params

    #-----------------------------FLIP KERNEL------------------------------------------
    W = array(W)
    W_new = [[None, None], [None, None], [None, None]]
    for i in range(W.shape[0]):
        for j in range(W.shape[1]):
            w = W[i, j].get_value()
            print w.shape, w.dtype
            for k in range(w.shape[0]):
                for l in range(w.shape[1]):
                    for m in range(w.shape[2]):
                        w[k, l, m] = cv2.flip(w[k, l, m], -1)
            W_new[i][j] = shared(array(w, dtype=floatX), borrow=True)
    W = W_new
    #-----------------------------FLIP KERNEL------------------------------------------

    rng = random.RandomState(
        1337)  # this will make sure results are always the same
    batch_size = 1

    in_shape = (1, 2, 2, 32, 64, 64
                )  # (batchsize, maps, frames, w, h) input video shapes
    traj_shape = (batch_size, 3, 32
                  )  # (batchsize, input shape of the trajectory

    # hyper parameters
    # ------------------------------------------------------------------------------

    # use techniques/methods
    class use:
        drop = True  # dropout
        depth = True  # use depth map as input
        aug = False  # data augmentation
        load = False  # load params.p file
        traj = False  # trajectory
        trajconv = False  # convolutions on trajectory
        valid2 = False
        fast_conv = False
        norm_div = False

        norm = True  # normalization layer
        mom = True  # momentum

    # regularization
    class reg:
        L1_traj = .0  # degree/amount of regularization
        L2_traj = .0  # 1: only L1, 0: only L2
        L1_vid = .0  # degree/amount of regularization
        L2_vid = .0  # 1: only L1, 0: only L2

    class trajconv:
        append = False  # append convolutions result to original traject
        filter_size = 5
        layers = 3  # number of convolution layers
        res_shape = traj_shape[-1] - layers * (filter_size - 1)

    class net:
        shared_stages = []  # stages where weights are shared
        shared_convnets = [
        ]  # convnets that share weights ith beighbouring convnet
        n_convnets = 2  # number of convolutional networks in the architecture
        maps = [2, 16, 32, 64]  # feature maps in each convolutional network
        # maps = [2,5,25,25] # feature maps in each convolutional network
        kernels = [(1, 7, 7), (1, 8, 8),
                   (1, 6, 6)]  # convolution kernel shapes
        pools = [(2, 2, 2), (2, 2, 2), (2, 2, 2)]  # pool/subsampling shapes
        hidden_traj = 200  # hidden units in MLP
        hidden_vid = 300  # hidden units in MLP
        W_scale = 0.01
        b_scale = 0.1
        norm_method = "lcn"  # normalisation method: lcn = local contrast normalisation
        pool_method = "max"  # maxpool
        fusion = "early"  # early or late fusion
        hidden = hidden_traj + hidden_vid if fusion == "late" else 500  # hidden units in MLP
        n_class = 21

    activation = relu
    n_stages = len(net.kernels)
    video_shapes = [in_shape[-3:]]

    def _shared(val, borrow=True):
        return shared(array(val, dtype=floatX), borrow=borrow)

    def ndtensor(n):
        return TensorType(floatX, (False, ) * n)  # n-dimensional tensor

    for i in xrange(n_stages):
        k, p, v = array(net.kernels[i]), array(net.pools[i]), array(
            video_shapes[i])
        conv_s = tuple(v - k + 1)
        video_shapes.append(tuple((v - k + 1) / p))
    n_in_MLP = net.maps[-1] * net.n_convnets * prod(video_shapes[-1])

    def conv_args(stage, i):
        """ ConvLayer arguments, i: stage index """
        args = {
            'batch_size': 1,
            'activation': activation,
            'rng': rng,
            'n_in_maps': net.maps[stage],
            'n_out_maps': net.maps[stage + 1],
            'kernel_shape': net.kernels[stage],
            'video_shape': video_shapes[stage],
            "fast_conv": use.fast_conv,
            "layer_name": "Conv" + str(stage),
            "W_scale": net.W_scale,
            "b_scale": net.b_scale,
            "stride": 1,
            "W": W[stage][i],
            "b": b[stage][i]
        }
        return args

    # print conv_args(0,0)
    x = ndtensor(len(in_shape))(name='x')  # video input

    def var_norm(_x, imgs=True, axis=[-3, -2, -1]):
        if imgs:
            return (_x - T.mean(_x, axis=axis, keepdims=True)) / T.maximum(
                1e-4, T.std(_x, axis=axis, keepdims=True))
        return (_x - T.mean(_x)) / T.maximum(1e-4, T.std(_x))

    def std_norm(_x, axis=[-3, -2, -1]):
        return _x / T.maximum(1e-4, T.std(_x, axis=axis, keepdims=True))

    out = [x[:, 0], x[:, 1]]

    for stage in xrange(n_stages):
        for i in xrange(len(out)):  # for each convnet of the stage
            if stage == 0:
                gray_norm = NormLayer(out[i][:, 0:1],
                                      method="lcn",
                                      use_divisor=False).output
                gray_norm = std_norm(gray_norm)
                depth_norm = var_norm(out[i][:, 1:])
                out[i] = T.concatenate([gray_norm, depth_norm], axis=1)
            else:
                out[i] = NormLayer(out[i], method="lcn",
                                   use_divisor=False).output
                out[i] = std_norm(out[i])
            out[i] = ConvLayer(out[i], **conv_args(stage, i)).output
            out[i] = PoolLayer(out[i],
                               net.pools[stage],
                               method=net.pool_method).output

    out = [out[i].flatten(2) for i in range(len(out))]
    out = T.concatenate(out, axis=1)

    #hidden layer
    out = HiddenLayer(out,
                      W=Wh,
                      b=bh,
                      n_in=n_in_MLP,
                      n_out=net.hidden,
                      rng=rng,
                      activation=activation).output

    logreg = LogRegr(out,
                     W=Ws,
                     b=bs,
                     rng=rng,
                     activation=activation,
                     n_in=net.hidden,
                     n_out=net.n_class)

    pred = logreg.p_y_given_x

    x_ = _shared(empty(in_shape))

    print "compiling..."
    eval_model = function([], [pred], givens={x: x_}, on_unused_input='ignore')
    print "compiling done"

    return eval_model, x_
Exemplo n.º 2
0
    "W_scale": net.W_scale[-2],
    "b_scale": net.b_scale[-2],
    "n_in": n_in_MLP,
    "n_out": net.hidden
}

# fusion
# Builds the hidden layer(s) that follow the ConvNet.  `args` is the keyword
# dict for HiddenLayer; its opening lines are outside this excerpt.
if net.fusion == "early":
    # Early fusion: a single hidden layer over the video features, optionally
    # concatenated with the trajectory features.
    if use.load:
        args["W"], args["b"] = load_params()
    if use.traj:
        out = T.concatenate([vid_, traj_], axis=1)
    else:
        out = vid_
    # hidden layer
    layers.append(HiddenLayer(out, **args))
    out = layers[-1].output

else:  # late fusion
    # Trajectory hidden layer first: n_in is reduced by the video feature
    # count, and the activation / output size are switched to the trajectory
    # settings before the layer is created.
    if use.load:
        args["W"], args["b"] = load_params()
    args["n_in"] -= net.maps[-1] * net.n_convnets * prod(video_shapes[-1])
    args["activation"] = tanh
    args["n_out"] = net.hidden_traj
    layers.append(HiddenLayer(traj_, **args))

    # Re-point `args` at the video features.  The excerpt ends before the
    # layer that consumes these values is created.
    # NOTE(review): load_params() is called once per layer; presumably it
    # returns the next stored (W, b) pair on each call -- confirm.
    if use.load:
        args["W"], args["b"] = load_params()
    args["n_in"] = net.maps[-1] * net.n_convnets * prod(video_shapes[-1])
    args["activation"] = relu
    args["n_out"] = net.hidden_vid
Exemplo n.º 3
0
    drop.p_hidden.set_value(float32(0.))  # dont use dropout when testing
    vid_ = DropoutLayer(vid_, rng=tr.rng, p=drop.p_vid).output

# MLP
# ------------------------------------------------------------------------------
# Early fusion: the video features go straight into one hidden layer whose
# weights and biases come from load_params.
if net.fusion == "early":
    out = vid_
    Wh, bh = load_params(use)  # This is test, wudi added this!
    layers.append(
        HiddenLayer(
            out,
            W=Wh,
            b=bh,
            n_in=n_in_MLP,
            n_out=net.hidden,
            rng=tr.rng,
            W_scale=net.W_scale[-2],
            b_scale=net.b_scale[-2],
            activation=relu,
        )
    )
    out = layers[-1].output

# Inspection vector: the first six collected entries plus the mean of the
# current output, or a two-element zero stack when inspection is disabled.
insp = (T.stack(insp[0], insp[1], insp[2], insp[3], insp[4], insp[5],
                T.mean(out))
        if tr.inspect else T.stack(0, 0))

if use.drop:
    out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output
#maxout
# softmax layer
def build():
    use.load = True  # we load the CNN parameteres here
    x = ndtensor(len(tr.in_shape))(name='x')  # video input
    x_ = _shared(empty(tr.in_shape))

    conv_shapes = []
    for i in xrange(net.n_stages):
        k, p, v = array(net.kernels[i]), array(net.pools[i]), array(
            tr.video_shapes[i])
        conv_s = tuple(v - k + 1)
        conv_shapes.append(conv_s)
        tr.video_shapes.append(tuple((v - k + 1) / p))
        print "stage", i
        print "  conv", tr.video_shapes[i], "->", conv_s
        print "  pool", conv_s, "->", tr.video_shapes[i + 1], "x", net.maps[i +
                                                                            1]

    # number of inputs for MLP = (# maps last stage)*(# convnets)*(resulting video shape) + trajectory size
    n_in_MLP = net.maps[-1] * net.n_convnets * prod(tr.video_shapes[-1])
    print 'MLP:', n_in_MLP, "->", net.hidden, "->", net.n_class, ""

    if use.depth:
        if net.n_convnets == 2:
            out = [x[:, :, 0, :, :, :], x[:, :,
                                          1, :, :, :]]  # 2 nets: body and hand

    # build 3D ConvNet
    layers = []  # all architecture layers
    insp = []
    for stage in xrange(net.n_stages):
        for i in xrange(len(out)):  # for body and hand
            # normalization
            if use.norm and stage == 0:
                gray_norm = NormLayer(out[i][:, 0:1],
                                      method="lcn",
                                      use_divisor=use.norm_div).output
                gray_norm = std_norm(gray_norm, axis=[-3, -2, -1])
                depth_norm = var_norm(out[i][:, 1:])
                out[i] = T.concatenate([gray_norm, depth_norm], axis=1)
            elif use.norm:
                out[i] = NormLayer(out[i],
                                   method="lcn",
                                   use_divisor=use.norm_div).output
                out[i] = std_norm(out[i], axis=[-3, -2, -1])
            # convolutions
            out[i] *= net.scaler[stage][i]
            layers.append(
                ConvLayer(
                    out[i],
                    **conv_args(stage, i, batch, net, use, tr.rng,
                                tr.video_shapes)))
            out[i] = layers[-1].output
            out[i] = PoolLayer(out[i],
                               net.pools[stage],
                               method=net.pool_method).output
            if tr.inspect: insp.append(T.mean(out[i]))

    # flatten all convnets outputs
    for i in xrange(len(out)):
        out[i] = std_norm(out[i], axis=[-3, -2, -1])
    out = [out[i].flatten(2) for i in range(len(out))]
    vid_ = T.concatenate(out, axis=1)

    # dropout
    if use.drop:
        drop.p_vid = shared(float32(drop.p_vid_val))
        drop.p_hidden = shared(float32(drop.p_hidden_val))
        drop.p_vid.set_value(float32(0.))  # dont use dropout when testing
        drop.p_hidden.set_value(float32(0.))  # dont use dropout when testing
        vid_ = DropoutLayer(vid_, rng=tr.rng, p=drop.p_vid).output

    # MLP
    # ------------------------------------------------------------------------------
    # fusion
    if net.fusion == "early":
        out = vid_
        # hidden layer
        Wh, bh = load_params(use)  # This is test, wudi added this!
        layers.append(
            HiddenLayer(out,
                        W=Wh,
                        b=bh,
                        n_in=n_in_MLP,
                        n_out=net.hidden,
                        rng=tr.rng,
                        W_scale=net.W_scale[-2],
                        b_scale=net.b_scale[-2],
                        activation=relu))
        out = layers[-1].output

    if tr.inspect:
        insp = T.stack(insp[0], insp[1], insp[2], insp[3], insp[4], insp[5],
                       T.mean(out))
    else:
        insp = T.stack(0, 0)

    if use.drop: out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output
    #maxout
    # softmax layer
    Ws, bs = load_params(use)  # This is test, wudi added this!
    layers.append(
        LogRegr(out,
                W=Ws,
                b=bs,
                rng=tr.rng,
                activation=lin,
                n_in=net.hidden,
                W_scale=net.W_scale[-1],
                b_scale=net.b_scale[-1],
                n_out=net.n_class))
    """
    layers[-1] : softmax layer
    layers[-2] : hidden layer (video if late fusion)
    layers[-3] : hidden layer (trajectory, only if late fusion)
    """
    # prediction
    y_pred = layers[-1].y_pred
    p_y_given_x = layers[-1].p_y_given_x
    ####################################################################
    ####################################################################
    print "\n%s\n\tcompiling\n%s" % (('-' * 30, ) * 2)
    ####################################################################
    ####################################################################
    # compile functions
    # ------------------------------------------------------------------------------
    print 'compiling test_model'

    eval_model = function([], [y_pred, p_y_given_x],
                          givens={x: x_},
                          on_unused_input='ignore')

    return eval_model, x_
Exemplo n.º 5
0
# MLP
# ------------------------------------------------------------------------------

# fusion
# Early fusion uses one hidden layer over the video features (joined with the
# trajectory features when use.traj is set).  Late fusion starts with a
# trajectory-only hidden layer; the excerpt ends before any video layer.
if net.fusion == "early":
    if use.traj:
        out = T.concatenate([vid_, traj_], axis=1)
    else:
        out = vid_
    # hidden layer
    layers.append(
        HiddenLayer(out,
                    n_in=n_in_MLP,
                    n_out=net.hidden,
                    rng=rng,
                    W_scale=net.W_scale,
                    b_scale=net.b_scale,
                    activation=activation))
    out = layers[-1].output
else:  # late fusion
    # Subtract the video feature count so n_in_MLP holds only what is left
    # for the trajectory layer.
    n_in_MLP -= net.maps[-1] * net.n_convnets * prod(video_shapes[-1])
    layers.append(
        HiddenLayer(traj_,
                    n_in=n_in_MLP,
                    n_out=net.hidden_traj,
                    rng=rng,
                    W_scale=net.W_scale,
                    b_scale=net.b_scale,
                    activation=activation))
    # Reset n_in_MLP to the video feature count for whatever follows.
    n_in_MLP = net.maps[-1] * net.n_convnets * prod(video_shapes[-1])
Exemplo n.º 6
0
    def __init__(self,
                 x,
                 use,
                 lr,
                 batch,
                 net,
                 reg,
                 drop,
                 mom,
                 tr,
                 res_dir,
                 load_path=""):

        self.out = []
        self.layers = []
        self.insp_mean = []
        self.insp_std = []

        for c in (use, lr, batch, net, reg, drop, mom, tr):
            write(c.__name__ + ":", res_dir)
            _s = c.__dict__
            del _s['__module__'], _s['__doc__']
            for key in _s.keys():
                val = str(_s[key])
                if val.startswith("<static"):
                    val = str(_s[key].__func__.__name__)
                if val.startswith("<Cuda"): continue
                if val.startswith("<Tensor"): continue
                write("  " + key + ": " + val, res_dir)

        ####################################################################
        ####################################################################
        print "\n%s\n\tbuilding\n%s" % (('-' * 30, ) * 2)
        ####################################################################
        ####################################################################
        # ConvNet
        # ------------------------------------------------------------------------------
        # calculate resulting video shapes for all stages
        print net.n_stages
        conv_shapes = []
        for i in xrange(net.n_stages):
            k, p, v = array(net.kernels[i]), array(net.pools[i]), array(
                tr.video_shapes[i])
            conv_s = tuple(v - k + 1)
            conv_shapes.append(conv_s)
            tr.video_shapes.append(tuple((v - k + 1) / p))
            print "stage", i
            if use.depth and i == 0:
                print "  conv", tr.video_shapes[
                    i], "x 2 ->", conv_s  #for body and hand
            else:
                print "  conv", tr.video_shapes[i], "->", conv_s
            print "  pool", conv_s, "->", tr.video_shapes[i +
                                                          1], "x", net.maps[i +
                                                                            1]

        # number of inputs for MLP = (# maps last stage)*(# convnets)*(resulting video shape) + trajectory size
        n_in_MLP = net.maps[-1] * net.n_convnets * prod(tr.video_shapes[-1])
        print 'debug1'
        if use.depth:
            if net.n_convnets == 2:
                out = [x[:, :, 0, :, :, :],
                       x[:, :, 1, :, :, :]]  # 2 nets: body and hand

        # build 3D ConvNet

        for stage in xrange(net.n_stages):
            for i in xrange(len(out)):  # for body and hand
                # normalization
                if use.norm and stage == 0:
                    gray_norm = NormLayer(out[i][:, 0:1],
                                          method="lcn",
                                          use_divisor=use.norm_div).output
                    gray_norm = std_norm(gray_norm, axis=[-3, -2, -1])
                    depth_norm = var_norm(out[i][:, 1:])
                    out[i] = T.concatenate([gray_norm, depth_norm], axis=1)
                elif use.norm:
                    out[i] = NormLayer(out[i],
                                       method="lcn",
                                       use_divisor=use.norm_div).output
                    out[i] = std_norm(out[i], axis=[-3, -2, -1])
                # convolutions
                out[i] *= net.scaler[stage][i]
                print 'debug2'
                self.layers.append(
                    ConvLayer(
                        out[i],
                        **conv_args(stage, i, batch, net, use, tr.rng,
                                    tr.video_shapes, load_path)))
                out[i] = self.layers[-1].output
                out[i] = PoolLayer(out[i],
                                   net.pools[stage],
                                   method=net.pool_method).output
                if tr.inspect:
                    self.insp_mean.append(T.mean(out[i]))
                    self.insp_std.append(T.std(out[i]))
        print 'debug2'
        # flatten all convnets outputs
        for i in xrange(len(out)):
            out[i] = std_norm(out[i], axis=[-3, -2, -1])
        out = [out[i].flatten(2) for i in range(len(out))]
        vid_ = T.concatenate(out, axis=1)
        print 'debug3'
        # dropout
        if use.drop:
            drop.p_vid = shared(float32(drop.p_vid_val))
            drop.p_hidden = shared(float32(drop.p_hidden_val))
            vid_ = DropoutLayer(vid_, rng=tr.rng, p=drop.p_hidden).output

        #maxout
        if use.maxout:
            vid_ = maxout(vid_, (batch.micro, n_in_MLP))
            net.activation = lin
            n_in_MLP /= 2
            # net.hidden *= 2

        # MLP
        # ------------------------------------------------------------------------------
        # fusion
        if net.fusion == "early":
            out = vid_
            # hidden layer
            if use.load:
                W, b = load_params(use, load_path)
                self.layers.append(
                    HiddenLayer(out,
                                n_in=n_in_MLP,
                                n_out=net.hidden_vid,
                                rng=tr.rng,
                                W=W,
                                b=b,
                                W_scale=net.W_scale[-2],
                                b_scale=net.b_scale[-2],
                                activation=net.activation))
            else:
                self.layers.append(
                    HiddenLayer(out,
                                n_in=n_in_MLP,
                                n_out=net.hidden_vid,
                                rng=tr.rng,
                                W_scale=net.W_scale[-2],
                                b_scale=net.b_scale[-2],
                                activation=net.activation))
            out = self.layers[-1].output

        #if tr.inspect:
        #self.insp_mean = T.stack(self.insp_mean)
        #self.insp_std = T.stack(self.insp_std)
        #self.insp = T.stack(self.insp[0],self.insp[1],self.insp[2],self.insp[3],self.insp[4],self.insp[5], T.mean(out))
        #else: self.insp =  T.stack(0,0)
        # out = normalize(out)
        if use.drop:
            out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output
        #maxout
        if use.maxout:
            out = maxout(out, (batch.micro, net.hidden))
            net.hidden /= 2
        print 'debug3'

        # now assembly all the output
        self.out = out
        self.n_in_MLP = n_in_MLP
    def __init__(self, res_dir, load_path):

        self.layers = []  # only contain the layers from fusion
        self.insp_mean = []  # inspection for each layer mean activation
        self.insp_std = []  # inspection for each layer std activation
        self.params = []  # parameter list
        self.idx_mini = T.lscalar(name="idx_mini")  # minibatch index
        self.idx_micro = T.lscalar(name="idx_micro")  # microbatch index

        # symbolic variables
        self.x = ndtensor(len(tr.in_shape))(name='x')  # video input
        self.y = T.ivector(name='y')  # labels
        # symbolic variables
        self.x_skeleton = ndtensor(len(tr._skeleon_in_shape))(
            name='x_skeleton')  # video input

        if use.drop:
            drop.p_vid = shared(float32(drop.p_vid_val))
            drop.p_hidden = shared(float32(drop.p_hidden_val))
        video_cnn = conv3d_chalearn(self.x, use, lr, batch, net, reg, drop, mom, \
                                             tr, res_dir, load_path)

        dbn = GRBM_DBN(numpy_rng=random.RandomState(123), n_ins=891, \
                hidden_layers_sizes=[2000, 2000, 1000], n_outs=101, input_x=self.x_skeleton, label=self.y )
        # we load the pretrained DBN skeleton parameteres here
        if use.load == True:
            dbn.load(os.path.join(load_path, 'dbn_2015-06-19-11-34-24.npy'))

        #####################################################################
        # fuse the ConvNet output with skeleton output  -- need to change here
        ######################################################################
        out = T.concatenate([video_cnn.out, dbn.sigmoid_layers[-1].output],
                            axis=1)

        #####################################################################
        # wudi add the mean and standard deviation of the activation values to exam the neural net
        # Reference: Understanding the difficulty of training deep feedforward neural networks, Xavier Glorot, Yoshua Bengio
        #####################################################################
        insp_mean_list = []
        insp_std_list = []
        insp_mean_list.extend(dbn.out_mean)
        insp_mean_list.extend(video_cnn.insp_mean)
        insp_std_list.extend(dbn.out_std)
        insp_std_list.extend(video_cnn.insp_std)

        ######################################################################
        #MLP layer
        self.layers.append(
            HiddenLayer(out,
                        n_in=net.hidden,
                        n_out=net.hidden,
                        rng=tr.rng,
                        W_scale=net.W_scale[-1],
                        b_scale=net.b_scale[-1],
                        activation=net.activation))
        out = self.layers[-1].output

        if tr.inspect:
            insp_mean_list.extend([T.mean(out)])
            insp_std_list.extend([T.std(out)])
        self.insp_mean = T.stacklists(insp_mean_list)
        self.insp_std = T.stacklists(insp_std_list)

        if use.drop:
            out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output

        ######################################################################
        # softmax layer
        self.layers.append(
            LogRegr(out,
                    rng=tr.rng,
                    n_in=net.hidden,
                    W_scale=net.W_scale[-1],
                    b_scale=net.b_scale[-1],
                    n_out=net.n_class))

        self.p_y_given_x = self.layers[-1].p_y_given_x
        ######################################################################
        # cost function
        self.cost = self.layers[-1].negative_log_likelihood(self.y)

        # function computing the number of errors
        self.errors = self.layers[-1].errors(self.y)

        # parameter list
        for layer in video_cnn.layers:
            self.params.extend(layer.params)

        # pre-trained dbn parameter last layer  (W, b) doesn't need to incorporate into the params
        # for calculating the gradient
        self.params.extend(dbn.params[:-2])

        # MLP hidden layer params
        self.params.extend(self.layers[-2].params)
        # softmax layer params
        self.params.extend(self.layers[-1].params)
        # number of inputs for MLP = (# maps last stage)*(# convnets)*(resulting video shape) + trajectory size
        print 'MLP:', video_cnn.n_in_MLP, "->", net.hidden_penultimate, "+", net.hidden_traj, '->', \
           net.hidden, '->', net.hidden, '->', net.n_class, ""

        return
Exemplo n.º 8
0
	def __init__(self, numpy_rng, batch_size, n_outs,conv_layer_configs, hidden_layer_configs, 
			conv_activation = T.nnet.sigmoid,hidden_activation = T.nnet.sigmoid):
		"""Build conv/pool stages, hidden layers and a logistic output layer.

		Args:
			numpy_rng: random generator passed to every ConvLayer and HiddenLayer.
			batch_size: batch size passed to every ConvLayer.
			n_outs: number of outputs of the logistic regression layer.
			conv_layer_configs: sequence of dicts, one per convolution stage,
				read for 'n_in_maps', 'n_out_maps', 'kernel_shape',
				'video_shape', 'poolsize', 'update' and 'output_shape'.
			hidden_layer_configs: dict whose 'hidden_layers' entry lists the
				hidden layer sizes.
			conv_activation: activation passed to the convolution layers.
			hidden_activation: activation passed to the hidden layers.

		NOTE(review): an empty conv_layer_configs leaves `config` unbound and
		the code after the convolution loop fails with a NameError.
		"""
		self.layers = []
		self.params = []
		self.delta_params = []
		self.n_layers = 0
		self.type = None

		#placeholders
		self.output = None
		self.features = None
		self.features_dim = None
		self.errors = None
		self.finetune_cost = None

		# allocate symbolic variables for the data
		self.x = tensor5('x')
		self.y = T.ivector('y')

		self.conv_layer_num = len(conv_layer_configs) 	#counting number of convolution layers
		self.hidden_layer_num = len(hidden_layer_configs['hidden_layers'])
		self.mlp_layer_start = self.hidden_layer_num

		self.mlp_layers = []
		self.conv_layers = []
		self.pool_layers = []

		for i in xrange(self.conv_layer_num):		# construct the convolution layer
			if i == 0:				# first stage reads the network input
				layer_in = self.x
			else:
				layer_in = self.layers[-1].output	# output of previous (pool) layer

			config = conv_layer_configs[i]
			conv_layer = ConvLayer(input=layer_in,
				n_in_maps=config['n_in_maps'], n_out_maps=config['n_out_maps'],
				kernel_shape=config['kernel_shape'],video_shape=config['video_shape'],
				batch_size=batch_size,
				numpy_rng=numpy_rng,activation=conv_activation)

			self.layers.append(conv_layer)
			self.conv_layers.append(conv_layer)
			pool_layer = PoolLayer(conv_layer.output, pool_shape=config['poolsize'])
			self.layers.append(pool_layer)
			# record the pooling layer itself (the conv layer was appended here before)
			self.pool_layers.append(pool_layer)

			if config['update']==True:	# only some convolution layers are updated
				self.params.extend(conv_layer.params)
				self.delta_params.extend(conv_layer.delta_params)

		hidden_layers = hidden_layer_configs['hidden_layers']
		self.conv_output_dim = (config['n_out_maps'] * numpy.prod(config['output_shape']))

		#flattening the last convolution output layer
		# NOTE(review): this flattens the last ConvLayer output, not the last
		# PoolLayer output; confirm it matches config['output_shape'].
		self.features = self.conv_layers[-1].output.flatten(2)
		self.features_dim = self.conv_output_dim

		for i in xrange(self.hidden_layer_num):		# construct the hidden layer
			if i == 0:				# first hidden layer reads the flattened features
				input_size = self.conv_output_dim
				layer_input = self.features
			else:
				input_size = hidden_layers[i - 1]	# number of hidden neurons in previous layers
				layer_input = self.layers[-1].output

			sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,n_in=input_size,
				n_out = hidden_layers[i], activation=hidden_activation)

			self.layers.append(sigmoid_layer)
			self.mlp_layers.append(sigmoid_layer)

			# NOTE(review): `config` is still the last convolution stage's
			# config here, so its 'update' flag gates every hidden layer.
			if config['update']==True:
				self.params.extend(sigmoid_layer.params)
				self.delta_params.extend(sigmoid_layer.delta_params)

		self.logLayer = LogisticRegression(input=self.layers[-1].output,n_in=hidden_layers[-1],n_out=n_outs)

		self.layers.append(self.logLayer)
		self.params.extend(self.logLayer.params)
		self.delta_params.extend(self.logLayer.delta_params)

		self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
		self.errors = self.logLayer.errors(self.y)
		self.output = self.logLayer.prediction()
# Fuse the video ConvNet output with the output of the last DBN sigmoid layer
# along axis 1.
out = T.concatenate([video_cnn.out, dbn.sigmoid_layers[-1].output], axis=1)

# some activation inspection
# Collect the ConvNet's inspection means first, then the mean output of every
# DBN sigmoid layer.
insp = []
for insp_temp in video_cnn.insp_mean:
    insp.append(insp_temp)
for layer in dbn.sigmoid_layers:
    insp.append(T.mean(layer.output))

# ------------------------------------------------------------------------------
#MLP layer
# NOTE(review): n_in is net.hidden although the input is the concatenation
# built above -- confirm net.hidden equals the fused feature width.
layers.append(
    HiddenLayer(out,
                n_in=net.hidden,
                n_out=net.hidden,
                rng=tr.rng,
                W_scale=net.W_scale[-1],
                b_scale=net.b_scale[-1],
                activation=net.activation))
out = layers[-1].output

# The hidden-layer mean is recorded before dropout is applied.
if tr.inspect: insp.append(T.mean(out))
if use.drop: out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output

# Turn the collected scalars into one symbolic tensor.
insp = T.stack(insp)

# softmax layer
layers.append(
    LogRegr(out,
            rng=tr.rng,
            n_in=net.hidden,
# maxout on the video features: switch the network activation to `lin` and
# halve the MLP input size
if use.maxout:
    vid_ = maxout(vid_, (batch.micro, n_in_MLP))
    net.activation = lin
    n_in_MLP /= 2
    # net.hidden *= 2

# MLP
# ------------------------------------------------------------------------------
# early fusion: one hidden layer over the video features; W and b are only
# passed when parameters are loaded
if net.fusion == "early":
    out = vid_
    _hidden_kwargs = {}
    if use.load:
        Wh, bh = load_params(use)  # This is test, wudi added this!
        _hidden_kwargs.update(W=Wh, b=bh)
    _hidden_kwargs.update(
        n_in=n_in_MLP,
        n_out=net.hidden,
        rng=tr.rng,
        W_scale=net.W_scale[-2],
        b_scale=net.b_scale[-2],
        activation=leaky_relu,
    )
    layers.append(HiddenLayer(out, **_hidden_kwargs))
    out = layers[-1].output

# inspection vector: the first six collected entries plus the mean of the
# current output, or a two-element zero stack when inspection is disabled
insp = (T.stack(insp[0], insp[1], insp[2], insp[3], insp[4], insp[5],
                T.mean(out))
        if tr.inspect else T.stack(0, 0))
# out = normalize(out)
if use.drop:
    out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output

# maxout on the hidden output, then halve the recorded hidden size
if use.maxout:
    out = maxout(out, (batch.micro, net.hidden))
    net.hidden /= 2