import os.path as osp

import h5py
import numpy as np
import theano
import theano.tensor as T

# project-local building blocks; the module paths below are assumptions
# inferred from how these helpers are used in this file
from layers import MLP, Embedding, BasicLSTM, Attention
from utils import split_state


class SceneMlp(object):
    """ multi-layer perceptron used to predict scene-specific context """
    def __init__(self, name='scene_mlp', layer_sizes=(2048, 1024, 1024, 80), model_file=None):
        self.name = name
        if model_file is not None:
            with h5py.File(model_file, 'r') as f:
                layer_sizes = f.attrs['layer_sizes']
        self.config = {'layer_sizes': layer_sizes}

        # define inputs
        x = T.matrix('x')
        y = T.matrix('y')
        self.inputs = [x, y]

        # define computation graph
        self.mlp = MLP(layer_sizes=layer_sizes, name='mlp', output_type='softmax')
        self.proba = self.mlp.compute(x)
        self.log_proba = T.log(self.proba)

        # define costs: symmetric KL divergence (both directions),
        # smoothed with 1e-30 to avoid log(0)
        def kl_divergence(p, q):
            kl = T.mean(T.sum(p * T.log((p + 1e-30) / (q + 1e-30)), axis=1))
            kl += T.mean(T.sum(q * T.log((q + 1e-30) / (p + 1e-30)), axis=1))
            return kl
        kl = kl_divergence(self.proba, y)
        acc = T.mean(T.eq(self.proba.argmax(axis=1), y.argmax(axis=1)))
        self.costs = [kl, acc]

        # layers and parameters
        self.layers = [self.mlp]
        self.params = sum([l.params for l in self.layers], [])

        # load weights from file, if model_file is not None
        if model_file is not None:
            self.load_weights(model_file)

    def save_to_dir(self, save_dir, idx='0'):
        save_file = osp.join(save_dir, self.name + '.h5.' + str(idx))
        for l in self.layers:
            l.save_weights(save_file)
        with h5py.File(save_file, 'a') as f:
            for k, v in self.config.items():
                f.attrs[k] = v

    def load_weights(self, model_file):
        for l in self.layers:
            l.load_weights(model_file)
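# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original module): compiling
# a training step for SceneMlp from its symbolic costs. Plain SGD stands in
# for whatever optimizer the project actually uses; `feat` (N x 2048 image
# features) and `label` (N x 80 one-hot scene labels) are assumed inputs.
# ---------------------------------------------------------------------------
def _train_scene_mlp_sketch(feat, label, lr=0.01, n_epochs=10):
    mlp = SceneMlp()
    kl, acc = mlp.costs                     # only kl is differentiated
    grads = T.grad(kl, mlp.params)
    updates = [(p, p - lr * g) for p, g in zip(mlp.params, grads)]
    train_fn = theano.function(mlp.inputs, [kl, acc], updates=updates)
    for _ in range(n_epochs):
        cost, accuracy = train_fn(feat, label)
    return mlp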
class Model(object):
    """ Region Attention model (attention MLP with its own hidden size nnh) """
    def __init__(self, name='ra', nimg=2048, nnh=512, na=512, nh=512, nw=512, nout=8843, npatch=30, model_file=None):
        self.name = name
        if model_file is not None:
            with h5py.File(model_file, 'r') as f:
                nimg = f.attrs['nimg']
                nnh = f.attrs['nnh']
                na = f.attrs['na']
                nh = f.attrs['nh']
                nw = f.attrs['nw']
                nout = f.attrs['nout']
                # npatch = f.attrs['npatch']
        self.config = {'nimg': nimg, 'nnh': nnh, 'na': na, 'nh': nh, 'nw': nw, 'nout': nout, 'npatch': npatch}

        # word embedding layer
        self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding')

        # initialization mlp layer
        self.init_mlp = MLP(layer_sizes=[na, 2*nh], output_type='tanh', name=self.name+'@init_mlp')
        self.proj_mlp = MLP(layer_sizes=[nimg, na], output_type='tanh', name=self.name+'@proj_mlp')

        # lstm
        self.lstm = BasicLSTM(dim_x=na+nw, dim_h=nh, name=self.name+'@lstm')

        # prediction mlp
        self.pred_mlp = MLP(layer_sizes=[na+nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp')

        # attention layer
        self.attention = Attention(dim_item=na, dim_context=na+nw+nh, hsize=nnh, name=self.name+'@attention')

        # inputs
        cap = T.imatrix('cap')
        img = T.tensor3('img')
        self.inputs = [cap, img]

        # go through sequence
        feat = self.proj_mlp.compute(img)
        init_e = feat.mean(axis=1)
        init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
        (state, self.p, loss, self.alpha), _ = theano.scan(fn=self.scan_func,
                                                           sequences=[cap[0:-1, :], cap[1:, :]],
                                                           outputs_info=[init_state, None, None, None],
                                                           non_sequences=[feat])

        # loss function
        loss = T.mean(loss)
        self.costs = [loss]

        # layers and parameters
        self.layers = [self.embedding, self.init_mlp, self.proj_mlp, self.attention, self.lstm, self.pred_mlp]
        self.params = sum([l.params for l in self.layers], [])

        # load weights from file, if model_file is not None
        if model_file is not None:
            self.load_weights(model_file)

        # these functions and variables are used in test stage;
        # the shared buffer holds projected features, which are na-dimensional
        # after proj_mlp
        self._init_func = None
        self._step_func = None
        self._proj_func = None
        self._feat_shared = theano.shared(np.zeros((1, npatch, na)).astype(theano.config.floatX))

    def compute(self, state, w_idx, feat):
        # word embedding
        word_vec = self.embedding.compute(w_idx)

        # split states
        e_tm1, c_tm1, h_tm1 = split_state(state, scheme=[(1, self.config['na']), (2, self.config['nh'])])

        # attention
        e_t, alpha = self.attention.compute(feat, T.concatenate([e_tm1, h_tm1, word_vec], axis=1))

        # lstm step
        e_w = T.concatenate([e_t, word_vec], axis=-1)
        c_t, h_t = self.lstm.compute(e_w, c_tm1, h_tm1)  # (mb, nh)

        # merge state
        new_state = T.concatenate([e_t, c_t, h_t], axis=-1)

        # predict word probability
        p = self.pred_mlp.compute(T.concatenate([e_t, h_t, word_vec], axis=-1))
        return new_state, p, alpha

    def scan_func(self, w_tm1, w_t, state, feat):
        # update state
        new_state, p, alpha = self.compute(state, w_tm1, feat)

        # cross-entropy loss
        loss = T.nnet.categorical_crossentropy(p, w_t)
        return new_state, p, loss, alpha

    def init_func(self, img_value):
        if self._proj_func is None:
            img = T.tensor3()
            self._proj_func = theano.function([img], self.proj_mlp.compute(img))
        if self._init_func is None:
            init_e = self._feat_shared.mean(axis=1)
            init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
            self._init_func = theano.function([], init_state)
        self._feat_shared.set_value(self._proj_func(img_value))
        return self._init_func()

    def step_func(self, state_value, w_value):
        if self._step_func is None:
            w = T.ivector()
            state = T.matrix()
            new_state, p, _ = self.compute(state, w, self._feat_shared)
            self._step_func = theano.function([state, w], [new_state, T.log(p)])
        return self._step_func(state_value, w_value)

    def save_to_dir(self, save_dir, idx):
        save_file = osp.join(save_dir, self.name+'.h5.'+str(idx))
        for l in self.layers:
            l.save_weights(save_file)
        with h5py.File(save_file, 'a') as f:
            for k, v in self.config.items():
                f.attrs[k] = v

    def load_weights(self, model_file):
        for l in self.layers:
            l.load_weights(model_file)
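# ---------------------------------------------------------------------------
# Usage sketch (illustration only): greedy decoding with init_func/step_func.
# The BOS/EOS token id of 0 and the `img_feat` array of shape
# (1, npatch, nimg) are assumptions; in practice this interface would
# typically be driven by a beam search rather than the argmax used here.
# ---------------------------------------------------------------------------
def _greedy_decode_sketch(model, img_feat, max_len=20, bos=0, eos=0):
    state = model.init_func(img_feat)          # project features, build h0/c0
    w = np.array([bos], dtype='int32')
    caption = []
    for _ in range(max_len):
        state, log_p = model.step_func(state, w)
        w = log_p.argmax(axis=1).astype('int32')  # most likely next word
        if int(w[0]) == eos:
            break
        caption.append(int(w[0]))
    return caption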
class Model(object):
    """ scene-specific contexts """
    def __init__(self, name='ss', nimg=2048, nh=512, nw=512, nout=8843, ns=80, model_file=None):
        self.name = name
        if model_file is not None:
            with h5py.File(model_file, 'r') as f:
                nimg = f.attrs['nimg']
                nh = f.attrs['nh']
                nw = f.attrs['nw']
                ns = f.attrs['ns']
                nout = f.attrs['nout']
        self.config = {'nimg': nimg, 'nh': nh, 'nw': nw, 'nout': nout, 'ns': ns}

        # word embedding layer
        self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding')

        # initialization mlp layer
        self.proj_mlp = MLP(layer_sizes=[nimg, 2*nh], output_type='tanh', name=self.name+'@proj_mlp')

        # lstm
        self.lstm = BasicLSTM(dim_x=nw+ns, dim_h=nh, name=self.name+'@lstm')

        # prediction mlp
        self.pred_mlp = MLP(layer_sizes=[nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp')

        # inputs
        cap = T.imatrix('cap')
        img = T.matrix('img')
        scene = T.matrix('scene')
        self.inputs = [cap, img, scene]

        # go through sequence
        init_state = self.proj_mlp.compute(img)
        (state, self.p, loss), _ = theano.scan(fn=self.scan_func,
                                               sequences=[cap[0:-1, :], cap[1:, :]],
                                               outputs_info=[init_state, None, None],
                                               non_sequences=[scene])

        # loss function
        loss = T.mean(loss)
        self.costs = [loss]

        # layers and parameters
        self.layers = [self.embedding, self.proj_mlp, self.lstm, self.pred_mlp]
        self.params = sum([l.params for l in self.layers], [])

        # load weights from file, if model_file is not None
        if model_file is not None:
            self.load_weights(model_file)

        # initialization for test stage
        self._init_func = None
        self._step_func = None
        self._scene_shared = theano.shared(np.zeros((1, ns)).astype(theano.config.floatX))

    def compute(self, state, w_idx, scene):
        # word embedding
        word_vec = self.embedding.compute(w_idx)

        # split states
        c_tm1, h_tm1 = split_state(state, scheme=[(2, self.config['nh'])])

        # lstm step
        w_s = T.concatenate([word_vec, scene], axis=1)
        c_t, h_t = self.lstm.compute(w_s, c_tm1, h_tm1)

        # merge state
        new_state = T.concatenate([c_t, h_t], axis=-1)

        # add w_{t-1} as feature
        h_and_w = T.concatenate([h_t, word_vec], axis=-1)

        # predict probability
        p = self.pred_mlp.compute(h_and_w)
        return new_state, p

    def scan_func(self, w_tm1, w_t, state, scene):
        # update state
        new_state, p = self.compute(state, w_tm1, scene)

        # cross-entropy loss
        loss = T.nnet.categorical_crossentropy(p, w_t)
        return new_state, p, loss

    def init_func(self, img_value, scene_value):
        if self._init_func is None:
            img = T.matrix()
            init_state = self.proj_mlp.compute(img)
            self._init_func = theano.function([img], init_state)
        self._scene_shared.set_value(scene_value)
        return self._init_func(img_value)

    def step_func(self, state_value, w_value):
        if self._step_func is None:
            w = T.ivector()
            state = T.matrix()
            new_state, p = self.compute(state, w, self._scene_shared)
            self._step_func = theano.function([state, w], [new_state, T.log(p)])
        return self._step_func(state_value, w_value)

    def save_to_dir(self, save_dir, idx):
        save_file = osp.join(save_dir, self.name+'.h5.'+str(idx))
        for l in self.layers:
            l.save_weights(save_file)
        with h5py.File(save_file, 'a') as f:
            for k, v in self.config.items():
                f.attrs[k] = v

    def load_weights(self, model_file):
        for l in self.layers:
            l.load_weights(model_file)
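# ---------------------------------------------------------------------------
# Usage sketch (illustration only): the `scene` input of this model is the
# posterior produced by SceneMlp above. The checkpoint file names and the
# (1, nimg)-shaped `img_feat` array are assumptions.
# ---------------------------------------------------------------------------
def _init_with_scene_sketch(img_feat):
    scene_mlp = SceneMlp(model_file='scene_mlp.h5.0')   # hypothetical checkpoint
    proba_fn = theano.function([scene_mlp.inputs[0]], scene_mlp.proba)
    model = Model(model_file='ss.h5.0')                 # hypothetical checkpoint
    scene = proba_fn(img_feat).astype(theano.config.floatX)
    state = model.init_func(img_feat, scene)            # also caches `scene`
    return model, state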
class Model(object):
    """ a re-implementation of the Google NIC system, used as the baseline in our paper """
    def __init__(self, name='gnic', nimg=2048, nh=512, nw=512, nout=8843, model_file=None):
        self.name = name
        if model_file is not None:
            with h5py.File(model_file, 'r') as f:
                nimg = f.attrs['nimg']
                nh = f.attrs['nh']
                nw = f.attrs['nw']
                nout = f.attrs['nout']
        self.config = {'nimg': nimg, 'nh': nh, 'nw': nw, 'nout': nout}

        # word embedding layer
        self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name + '@embedding')

        # initialization mlp layer
        self.proj_mlp = MLP(layer_sizes=[nimg, 2 * nh], output_type='tanh', name=self.name + '@proj_mlp')

        # lstm
        self.lstm = BasicLSTM(dim_x=nw, dim_h=nh, name=self.name + '@lstm')

        # prediction mlp
        self.pred_mlp = MLP(layer_sizes=[nh + nw, nout], output_type='softmax', name=self.name + '@pred_mlp')

        # inputs
        cap = T.imatrix('cap')
        img = T.matrix('img')
        self.inputs = [cap, img]

        # go through sequence
        init_state = self.proj_mlp.compute(img)
        (state, self.p, loss), _ = theano.scan(fn=self.scan_func,
                                               sequences=[cap[0:-1, :], cap[1:, :]],
                                               outputs_info=[init_state, None, None])

        # loss function
        loss = T.mean(loss)
        self.costs = [loss]

        # layers and parameters
        self.layers = [self.embedding, self.proj_mlp, self.lstm, self.pred_mlp]
        self.params = sum([l.params for l in self.layers], [])

        # load weights from file, if model_file is not None
        if model_file is not None:
            self.load_weights(model_file)

        # these functions are used in test stage
        self._init_func = None
        self._step_func = None

    def compute(self, state, w_idx):
        # word embedding
        word_vec = self.embedding.compute(w_idx)

        # split states
        c_tm1, h_tm1 = split_state(state, scheme=[(2, self.config['nh'])])

        # lstm step
        c_t, h_t = self.lstm.compute(word_vec, c_tm1, h_tm1)

        # merge state
        new_state = T.concatenate([c_t, h_t], axis=-1)

        # add w_{t-1} as feature
        h_and_w = T.concatenate([h_t, word_vec], axis=-1)

        # predict probability
        p = self.pred_mlp.compute(h_and_w)
        return new_state, p

    def scan_func(self, w_tm1, w_t, state):
        # update state
        new_state, p = self.compute(state, w_tm1)

        # cross-entropy loss
        loss = T.nnet.categorical_crossentropy(p, w_t)
        return new_state, p, loss

    def init_func(self, img_value):
        if self._init_func is None:
            img = T.matrix()
            init_state = self.proj_mlp.compute(img)
            self._init_func = theano.function([img], init_state)
        return self._init_func(img_value)

    def step_func(self, state_value, w_value):
        if self._step_func is None:
            w = T.ivector()
            state = T.matrix()
            new_state, p = self.compute(state, w)
            self._step_func = theano.function([state, w], [new_state, T.log(p)])
        return self._step_func(state_value, w_value)

    def save_to_dir(self, save_dir, idx):
        save_file = osp.join(save_dir, self.name + '.h5.' + str(idx))
        for l in self.layers:
            l.save_weights(save_file)
        with h5py.File(save_file, 'a') as f:
            for k, v in self.config.items():
                f.attrs[k] = v

    def load_weights(self, model_file):
        for l in self.layers:
            l.load_weights(model_file)
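# ---------------------------------------------------------------------------
# Usage sketch (illustration only): compiling one training step from the
# model's symbolic loss over its declared inputs. Plain SGD stands in for the
# project's actual optimizer; `caps` (int32, seq_len x batch) and `imgs`
# (batch x nimg) are assumed numpy batches.
# ---------------------------------------------------------------------------
def _train_step_sketch(model, caps, imgs, lr=0.01):
    if not hasattr(model, '_train_fn'):
        loss = model.costs[0]
        grads = T.grad(loss, model.params)
        updates = [(p, p - lr * g) for p, g in zip(model.params, grads)]
        model._train_fn = theano.function(model.inputs, loss, updates=updates)
    return model._train_fn(caps, imgs)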
class Model(object):
    """ Region Attention model """
    def __init__(self, name='ra', nimg=2048, na=512, nh=512, nw=512, nout=8843, npatch=30, model_file=None):
        self.name = name
        if model_file is not None:
            with h5py.File(model_file, 'r') as f:
                nimg = f.attrs['nimg']
                na = f.attrs['na']
                nh = f.attrs['nh']
                nw = f.attrs['nw']
                nout = f.attrs['nout']
                # npatch = f.attrs['npatch']
        self.config = {'nimg': nimg, 'na': na, 'nh': nh, 'nw': nw, 'nout': nout, 'npatch': npatch}

        # word embedding layer
        self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding')

        # initialization mlp layer
        self.init_mlp = MLP(layer_sizes=[na, 2*nh], output_type='tanh', name=self.name+'@init_mlp')
        self.proj_mlp = MLP(layer_sizes=[nimg, na], output_type='tanh', name=self.name+'@proj_mlp')

        # lstm
        self.lstm = BasicLSTM(dim_x=na+nw, dim_h=nh, name=self.name+'@lstm')

        # prediction mlp
        self.pred_mlp = MLP(layer_sizes=[na+nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp')

        # attention layer
        self.attention = Attention(dim_item=na, dim_context=na+nw+nh, hsize=nh, name=self.name+'@attention')

        # inputs
        cap = T.imatrix('cap')
        img = T.tensor3('img')
        self.inputs = [cap, img]

        # go through sequence
        feat = self.proj_mlp.compute(img)
        init_e = feat.mean(axis=1)
        init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
        (state, self.p, loss, self.alpha), _ = theano.scan(fn=self.scan_func,
                                                           sequences=[cap[0:-1, :], cap[1:, :]],
                                                           outputs_info=[init_state, None, None, None],
                                                           non_sequences=[feat])

        # loss function
        loss = T.mean(loss)
        self.costs = [loss]

        # layers and parameters
        self.layers = [self.embedding, self.init_mlp, self.proj_mlp, self.attention, self.lstm, self.pred_mlp]
        self.params = sum([l.params for l in self.layers], [])

        # load weights from file, if model_file is not None
        if model_file is not None:
            self.load_weights(model_file)

        # these functions and variables are used in test stage
        self._init_func = None
        self._step_func = None
        self._proj_func = None
        self._feat_shared = theano.shared(np.zeros((1, npatch, na)).astype(theano.config.floatX))

    def compute(self, state, w_idx, feat):
        # word embedding
        word_vec = self.embedding.compute(w_idx)

        # split states
        e_tm1, c_tm1, h_tm1 = split_state(state, scheme=[(1, self.config['na']), (2, self.config['nh'])])

        # attention
        e_t, alpha = self.attention.compute(feat, T.concatenate([e_tm1, h_tm1, word_vec], axis=1))

        # lstm step
        e_w = T.concatenate([e_t, word_vec], axis=-1)
        c_t, h_t = self.lstm.compute(e_w, c_tm1, h_tm1)  # (mb, nh)

        # merge state
        new_state = T.concatenate([e_t, c_t, h_t], axis=-1)

        # predict word probability
        p = self.pred_mlp.compute(T.concatenate([e_t, h_t, word_vec], axis=-1))
        return new_state, p, alpha

    def scan_func(self, w_tm1, w_t, state, feat):
        # update state
        new_state, p, alpha = self.compute(state, w_tm1, feat)

        # cross-entropy loss
        loss = T.nnet.categorical_crossentropy(p, w_t)
        return new_state, p, loss, alpha

    def init_func(self, img_value):
        if self._proj_func is None:
            img = T.tensor3()
            self._proj_func = theano.function([img], self.proj_mlp.compute(img))
        if self._init_func is None:
            init_e = self._feat_shared.mean(axis=1)
            init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
            self._init_func = theano.function([], init_state)
        self._feat_shared.set_value(self._proj_func(img_value))
        return self._init_func()

    def step_func(self, state_value, w_value):
        if self._step_func is None:
            w = T.ivector()
            state = T.matrix()
            new_state, p, _ = self.compute(state, w, self._feat_shared)
            self._step_func = theano.function([state, w], [new_state, T.log(p)])
        return self._step_func(state_value, w_value)

    def save_to_dir(self, save_dir, idx):
        save_file = osp.join(save_dir, self.name+'.h5.'+str(idx))
        for l in self.layers:
            l.save_weights(save_file)
        with h5py.File(save_file, 'a') as f:
            for k, v in self.config.items():
                f.attrs[k] = v

    def load_weights(self, model_file):
        for l in self.layers:
            l.load_weights(model_file)
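# ---------------------------------------------------------------------------
# Usage sketch (illustration only): checkpoint round-trip. save_to_dir writes
# one HDF5 file per index and stores the config in its attrs, so the
# constructor can rebuild the architecture from model_file alone. The
# `snapshots` directory is an assumption and must already exist.
# ---------------------------------------------------------------------------
def _checkpoint_roundtrip_sketch(model, idx=0):
    model.save_to_dir('snapshots', idx)      # -> snapshots/ra.h5.<idx>
    return Model(model_file=osp.join('snapshots', 'ra.h5.' + str(idx)))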