def encoder(self, questions, questionLengths, projWords = False, projQuestion = False, projDim = None):
    """Run the question through the encoder RNN layer(s) and optional projections.

    Args:
        questions: encoder input, handed unchanged to every ops.RNNLayer call.
        questionLengths: lengths forwarded to ops.RNNLayer.
        projWords: when true, project the per-word outputs from config.encDim
            to projDim (variable name "projCW").
        projQuestion: when true, project the question vector from
            config.encDim to projDim using activation config.encProjQAct
            (variable name "projQ").
        projDim: output size of the optional projections.

    Returns:
        Tuple (questionCntxWords, vecQuestions): the two values returned by
        the last ops.RNNLayer call, with dropout applied to vecQuestions and
        the optional projections applied afterwards.
    """
    with tf.variable_scope("encoder"):
        # variational dropout settings are only assembled when the option is on
        if config.encVariationalDropout:
            varDp = {
                "stateDp": self.dropouts["stateInput"],
                "inputDp": self.dropouts["encInput"],
                "inputSize": config.wrdEmbDim,
            }
        else:
            varDp = None

        # rnns
        # NOTE(review): every layer is fed `questions` rather than the output
        # of the previous layer, so only the last layer's outputs are kept.
        # Confirm whether stacking was intended. With config.encNumLayers == 0
        # the names below are never bound.
        for layerIdx in range(config.encNumLayers):
            questionCntxWords, vecQuestions = ops.RNNLayer(
                questions, questionLengths, config.encDim,
                bi = config.encBi, cellType = config.encType,
                dropout = self.dropouts["encInput"], varDp = varDp,
                name = "rnn%d" % layerIdx)

        # dropout for the question vector
        vecQuestions = tf.nn.dropout(vecQuestions, self.dropouts["question"])

        # projection of encoder outputs
        if projWords:
            questionCntxWords = ops.linear(
                questionCntxWords, config.encDim, projDim, name = "projCW")
        if projQuestion:
            vecQuestions = ops.linear(
                vecQuestions, config.encDim, projDim,
                act = config.encProjQAct, name = "projQ")

    return questionCntxWords, vecQuestions
def control(self, controlInput, inWords, outWords, questionLengths, control, contControl = None, name = "", reuse = None):
    """Control unit: attend over the question words to produce the new control.

    Args:
        controlInput: question-derived input for this step.
        inWords: word representations used to compute the attention logits.
        outWords: word representations summarized by the attention.
        questionLengths: lengths passed to ops.expMask to mask the logits.
        control: previous control state.
        contControl: previous "continuous" (pre-attention) control state; read
            only when config.controlFeedPrev is set and
            config.controlFeedPrevAtt is not.
        name: suffix appended to the "control" variable scope name.
        reuse: reuse flag forwarded to tf.variable_scope.

    Returns:
        Tuple (newControl, newContControl).

    Side effects:
        Appends the word attention to self.attentions["question"].
    """
    with tf.variable_scope("control" + name, reuse = reuse):
        featDim = config.ctrlDim

        ## Step 1: compute "continuous" control state given previous control and question.
        # control inputs: question and previous control
        newContControl = controlInput
        if config.controlFeedPrev:
            if config.controlFeedPrevAtt:
                newContControl = control
            else:
                newContControl = contControl

            if config.controlFeedInputs:
                newContControl = tf.concat([newContControl, controlInput], axis = -1)
                featDim += config.ctrlDim

            # merge inputs together
            # NOTE(review): nesting reconstructed from a flattened source;
            # confirm this projection belongs under config.controlFeedPrev.
            newContControl = ops.linear(
                newContControl, featDim, config.ctrlDim,
                act = config.controlContAct, name = "contControl")
            featDim = config.ctrlDim

        ## Step 2: compute attention distribution over words and sum them up accordingly.
        # interactions between the continuous control and each question word
        wordInter = tf.expand_dims(newContControl, axis = 1) * inWords

        # optionally concatenate words
        if config.controlConcatWords:
            wordInter = tf.concat([wordInter, inWords], axis = -1)
            featDim += config.ctrlDim

        # optional projection
        if config.controlProj:
            wordInter = ops.linear(
                wordInter, featDim, config.ctrlDim, act = config.controlProjAct)
            featDim = config.ctrlDim

        # attention distribution over words, masked by question length
        logits = ops.inter2logits(wordInter, featDim)
        # self.interL = (interW, interb)

        # if config.controlCoverage:
        #     logits += coverageBias * coverage

        maskedLogits = ops.expMask(logits, questionLengths)
        attention = tf.nn.softmax(maskedLogits)
        self.attentions["question"].append(attention)

        # if config.controlCoverage:
        #     coverage += attention # Add logits instead?

        # the summary is always built; the ablation below only chooses which
        # tensor is returned as the new control
        wordSummary = ops.att2Smry(attention, outWords)

        # ablation: use continuous control (pre-attention) instead
        newControl = newContControl if config.controlContinuous else wordSummary

    return newControl, newContControl
def zero_state(self, batchSize, dtype = tf.float32):
    """Initialize the cell's bookkeeping, knowledge base and words, and return the initial state.

    Args:
        batchSize: batch size forwarded to self.initState and to the
            variational dropout mask generator.
        dtype: unused here; kept so the signature stays compatible with callers.

    Returns:
        MACCellTuple(initialControl, initialMemory).

    Side effects:
        Resets self.attentions and self.autoEncLosses; sets self.controls,
        self.memories, self.infos, self.contControl, self.inWords and
        self.outWords; may overwrite self.knowledgeBase and
        self.questionLengths; may set self.memDpMask.
    """
    ## initialize data-structures
    self.attentions = {"kb": [], "question": [], "self": [], "gate": []}
    self.autoEncLosses = {"control": tf.constant(0.0), "memory": tf.constant(0.0)}

    ## initialize state
    initialControl = self.initState("initCtrl", config.ctrlDim, config.initCtrl, batchSize)
    initialMemory = self.initState("initMem", config.memDim, config.initMem, batchSize)

    # per-step histories start with the initial state on a new axis 1;
    # infos is seeded with the initial memory as well
    self.controls = tf.expand_dims(initialControl, axis = 1)
    self.memories = tf.expand_dims(initialMemory, axis = 1)
    self.infos = tf.expand_dims(initialMemory, axis = 1)

    self.contControl = initialControl
    # self.contControls = tf.expand_dims(initialControl, axis = 1)
    # self.postControls = tf.expand_dims(initialControl, axis = 1)

    ## initialize knowledge base
    # optionally merge question into knowledge base representation
    if config.initKBwithQ != "NON":
        iVecQuestions = ops.linear(self.vecQuestions, config.ctrlDim, config.memDim, name = "questions")

        concatMul = (config.initKBwithQ == "MUL")
        cnct, dim = ops.concat(self.knowledgeBase, iVecQuestions, config.memDim, mul = concatMul, expandY = True)
        self.knowledgeBase = ops.linear(cnct, dim, config.memDim, name = "initKB")

    ## initialize question words
    # choose question words to work with (original embeddings or encoder outputs)
    words = self.questionCntxWords if config.controlContextual else self.questionWords

    # optionally add a parametric "null" word to all questions.
    # The lengths live on the instance (the control unit is later given
    # self.questionLengths), so read and update the attribute; the bare name
    # `questionLengths` was an unbound local here and raised UnboundLocalError.
    if config.addNullWord:
        words, self.questionLengths = self.addNullWord(words, self.questionLengths)

    # project words
    self.inWords = self.outWords = words
    if config.controlInWordsProj or config.controlOutWordsProj:
        # a single shared projection serves whichever of the two roles asks for it
        pWords = ops.linear(words, config.ctrlDim, config.ctrlDim, name = "wordsProj")
        self.inWords = pWords if config.controlInWordsProj else words
        self.outWords = pWords if config.controlOutWordsProj else words

    # if config.controlCoverage:
    #     self.coverage = tf.zeros((batchSize, tf.shape(words)[1]), dtype = tf.float32)
    #     self.coverageBias = tf.get_variable("coverageBias", shape = (),
    #         initializer = config.controlCoverageBias)

    ## initialize memory variational dropout mask
    if config.memoryVariationalDropout:
        self.memDpMask = ops.generateVarDpMask((batchSize, config.memDim), self.dropouts["memory"])

    return MACCellTuple(initialControl, initialMemory)
def memAutoEnc(newMemory, info, control, name = "", reuse = None):
    """Auto-encoder loss that reconstructs the control from the memory or info.

    Args:
        newMemory: memory used as the input features unless
            config.autoEncMemInputs == "INFO".
        info: used as the input features when config.autoEncMemInputs == "INFO".
        control: reconstruction target for the squared-difference losses.
        name: suffix appended to the "memAutoEnc" variable scope name.
        reuse: reuse flag forwarded to tf.variable_scope.

    Returns:
        The loss selected by config.autoEncMemLoss: "CONT" compares the
        projected features with control directly; "PROB" is a softmax
        cross-entropy against the latest question attention; any other value
        compares control with an attention summary of the contextual words.

    NOTE(review): the signature has no `self`, yet every branch other than
    "CONT" reads self.questionCntxWords, self.expMask, self.questionLengths
    and self.attentions. As written those branches raise NameError unless
    `self` resolves from an enclosing scope. The only visible call site is
    commented out. Confirm the intended signature before enabling.
    """
    with tf.variable_scope("memAutoEnc" + name, reuse = reuse):
        # inputs to auto encoder
        source = info if config.autoEncMemInputs == "INFO" else newMemory
        features = ops.linear(
            source, config.memDim, config.ctrlDim,
            act = config.autoEncMemAct, name = "aeMem")

        # reconstruct control
        if config.autoEncMemLoss == "CONT":
            return tf.reduce_mean(tf.squared_difference(control, features))

        interactions, dim = ops.mul(
            self.questionCntxWords, features, config.ctrlDim,
            concat = {"x": config.autoEncMemCnct},
            mulBias = config.mulBias, name = "aeMem")

        logits = ops.inter2logits(interactions, dim)
        logits = self.expMask(logits, self.questionLengths)

        # reconstruct word attentions
        if config.autoEncMemLoss == "PROB":
            crossEntropy = tf.nn.softmax_cross_entropy_with_logits(
                labels = self.attentions["question"][-1], logits = logits)
            return tf.reduce_mean(crossEntropy)

        # reconstruct control through words attentions
        attention = tf.nn.softmax(logits)
        summary = ops.att2Smry(attention, self.questionCntxWords)
        return tf.reduce_mean(tf.squared_difference(control, summary))
def outputOp(self, memory, vecQuestions, images, imageInDim):
    """Assemble the feature vector passed on from the final memory.

    Args:
        memory: memory state used as the base features (config.memDim wide).
        vecQuestions: question vector; projected from config.ctrlDim to
            config.memDim and merged in when config.outQuestion is set.
        images: image features; linearized and appended when config.outImage
            is set.
        imageInDim: input feature dimension of `images`, forwarded to
            ops.linearizeFeatures.

    Returns:
        Tuple (features, dim): the assembled features and their width.
    """
    with tf.variable_scope("outputUnit"):
        features = memory
        dim = config.memDim

        if config.outQuestion:
            eVecQuestions = ops.linear(vecQuestions, config.ctrlDim, config.memDim, name = "outQuestion")
            features, dim = ops.concat(features, eVecQuestions, config.memDim, mul = config.outQuestionMul)

        if config.outImage:
            # honour the imageInDim argument (it was previously ignored in
            # favour of self.imageInDim)
            images, imagesDim = ops.linearizeFeatures(
                images, self.H, self.W, imageInDim, outputDim = config.outImageDim)
            # the projection's input width is whatever linearizeFeatures
            # reports, not config.memDim, which only matched by coincidence
            images = ops.linear(images, imagesDim, config.outImageDim, name = "outImage")
            features = tf.concat([features, images], axis = -1)
            dim += config.outImageDim

    return features, dim
def __call__(self, inputs, state, scope = None):
    """Run one reasoning step: control unit, then read unit, then write unit.

    Args:
        inputs: unused by this method.
        state: previous state; its `control` and `memory` fields are read.
        scope: variable scope name; defaults to the class name.

    Returns:
        Tuple (self.none, newState), where newState is
        MACCellTuple(newControl, newMemory).

    Side effects:
        Updates self.contControl and appends the new control, memory and info
        to self.controls, self.memories and self.infos along axis 1.
    """
    scope = scope or type(self).__name__
    with tf.variable_scope(scope, reuse = self.reuse): # as tfscope
        prevControl = state.control
        prevMemory = state.memory

        # cell sharing: variables are reused on every iteration after the
        # first, unless the matching "unshared" option gives each iteration
        # its own name (reuse = None in that case)
        pastFirstStep = (self.iteration > 0)

        sharedInputName, sharedInputReuse = "qInput", pastFirstStep
        if config.controlInputUnshared:
            stepInputName, stepInputReuse = "qInput%d" % self.iteration, None
        else:
            stepInputName, stepInputReuse = "qInputU", pastFirstStep

        if config.unsharedCells:
            cellName, cellReuse = str(self.iteration), None
        else:
            cellName, cellReuse = "", pastFirstStep

        ## control unit
        # prepare question input to control
        questionInput = ops.linear(
            self.vecQuestions, config.ctrlDim, config.ctrlDim,
            name = sharedInputName, reuse = sharedInputReuse)
        questionInput = ops.activations[config.controlInputAct](questionInput)
        questionInput = ops.linear(
            questionInput, config.ctrlDim, config.ctrlDim,
            name = stepInputName, reuse = stepInputReuse)

        newControl, self.contControl = self.control(
            questionInput, self.inWords, self.outWords, self.questionLengths,
            prevControl, self.contControl, name = cellName, reuse = cellReuse)

        ## read unit
        # ablation: use whole question as control
        if config.controlWholeQ:
            newControl = self.vecQuestions
            # ops.linear(self.vecQuestions, config.ctrlDim, projDim, name = "qMod")

        info = self.read(
            self.knowledgeBase, prevMemory, newControl,
            name = cellName, reuse = cellReuse)

        ## write unit
        if config.writeDropout < 1.0:
            info = tf.nn.dropout(info, self.dropouts["write"])

        newMemory = self.write(
            prevMemory, info, newControl, self.contControl,
            name = cellName, reuse = cellReuse)

        # add auto encoder loss for memory
        # if config.autoEncMem:
        #     self.autoEncLosses["memory"] += memAutoEnc(newMemory, info, newControl)

        # append as standard list?
        controlStep = tf.expand_dims(newControl, axis = 1)
        memoryStep = tf.expand_dims(newMemory, axis = 1)
        infoStep = tf.expand_dims(info, axis = 1)
        self.controls = tf.concat([self.controls, controlStep], axis = 1)
        self.memories = tf.concat([self.memories, memoryStep], axis = 1)
        self.infos = tf.concat([self.infos, infoStep], axis = 1)

        # self.contControls = tf.concat([self.contControls, tf.expand_dims(contControl, axis = 1)], axis = 1)
        # self.postControls = tf.concat([self.controls, tf.expand_dims(postControls, axis = 1)], axis = 1)

        newState = MACCellTuple(newControl, newMemory)

    return self.none, newState
def write(self, memory, info, control, contControl = None, name = "", reuse = None):
    """Write unit: merge the retrieved info into the memory.

    Args:
        memory: previous memory state (config.memDim wide).
        info: information returned by the read unit.
        control: current control state (config.ctrlDim wide; it is projected
            from config.ctrlDim for both the self-attention and the gate).
        contControl: continuous control; used for the self-attention query
            when config.writeSelfAttMod == "CONT".
        name: suffix appended to the "write" variable scope name.
        reuse: reuse flag forwarded to tf.variable_scope.

    Returns:
        The new memory state.

    Side effects:
        Appends to self.attentions["self"] when config.writeSelfAtt is set and
        to self.attentions["gate"] when config.writeGate is set.
    """
    with tf.variable_scope("write" + name, reuse = reuse):
        # optionally project info
        if config.writeInfoProj:
            info = ops.linear(info, config.memDim, config.memDim, name = "info")

        # optional info nonlinearity
        info = ops.activations[config.writeInfoAct](info)

        # compute self-attention vector based on previous controls and memories
        if config.writeSelfAtt:
            selfControl = control
            if config.writeSelfAttMod == "CONT":
                selfControl = contControl
            # elif config.writeSelfAttMod == "POST":
            #     selfControl = postControl
            selfControl = ops.linear(selfControl, config.ctrlDim, config.ctrlDim, name = "ctrlProj")

            # compare the query with every control stored so far
            interactions = self.controls * tf.expand_dims(selfControl, axis = 1)

            # if config.selfAttShareInter:
            #     selfAttlogits = self.linearP(selfAttInter, config.encDim, 1, self.interL[0], self.interL[1], name = "modSelfAttInter")
            attention = ops.inter2att(interactions, config.ctrlDim, name = "selfAttention")
            self.attentions["self"].append(attention)
            selfSmry = ops.att2Smry(attention, self.memories)

        # get write unit inputs: previous memory, the new info, optionally self-attention / control
        newMemory, dim = memory, config.memDim
        if config.writeInputs == "INFO":
            newMemory = info
        elif config.writeInputs == "SUM":
            newMemory = newMemory + info
        elif config.writeInputs == "BOTH":
            newMemory, dim = ops.concat(newMemory, info, dim, mul = config.writeConcatMul)
        # else: MEM

        if config.writeSelfAtt:
            newMemory = tf.concat([newMemory, selfSmry], axis = -1)
            dim += config.memDim

        if config.writeMergeCtrl:
            newMemory = tf.concat([newMemory, control], axis = -1)
            # control contributes ctrlDim features, not memDim; the two only
            # coincide when config.ctrlDim == config.memDim
            dim += config.ctrlDim

        # project memory back to memory dimension
        if config.writeMemProj or (dim != config.memDim):
            newMemory = ops.linear(newMemory, dim, config.memDim, name = "newMemory")

        # optional memory nonlinearity
        newMemory = ops.activations[config.writeMemAct](newMemory)

        # write unit gate
        if config.writeGate:
            # a shared gate is a single value instead of one per memory feature
            gateDim = 1 if config.writeGateShared else config.memDim

            z = tf.sigmoid(ops.linear(control, config.ctrlDim, gateDim, name = "gate", bias = config.writeGateBias))

            self.attentions["gate"].append(z)

            # interpolate between the candidate memory and the previous memory
            newMemory = newMemory * z + memory * (1 - z)

        # optional batch normalization
        if config.memoryBN:
            newMemory = tf.contrib.layers.batch_norm(
                newMemory, decay = config.bnDecay, center = config.bnCenter,
                scale = config.bnScale, is_training = self.train,
                updates_collections = None)

    return newMemory
def read(self, knowledgeBase, memory, control, name = "", reuse = None):
    """Read unit: attend over the knowledge base given the memory and control.

    Args:
        knowledgeBase: knowledge base elements to attend over.
        memory: previous memory state; dropout is applied before use.
        control: current control state (config.ctrlDim wide); projected to the
            interaction width when the two differ.
        name: suffix appended to the "read" variable scope name.
        reuse: reuse flag forwarded to tf.variable_scope.

    Returns:
        The attention-weighted summary of the knowledge base (or of its
        projection when config.readSmryKBProj is set).

    Side effects:
        Appends the attention to self.attentions["kb"].
    """
    with tf.variable_scope("read" + name, reuse = reuse):
        dim = config.memDim

        ## memory dropout
        if config.memoryVariationalDropout:
            memory = ops.applyVarDpMask(memory, self.memDpMask, self.dropouts["memory"])
        else:
            memory = tf.nn.dropout(memory, self.dropouts["memory"])

        ## Step 1: knowledge base / memory interactions
        # parameters for knowledge base and memory projection
        proj = None
        if config.readProjInputs:
            proj = {"dim": config.attDim, "shared": config.readProjShared, "dropout": self.dropouts["read"]}
            dim = config.attDim

        # parameters for concatenating knowledge base elements
        concat = {"x": config.readMemConcatKB, "proj": config.readMemConcatProj}

        # compute interactions between knowledge base and memory
        interactions, interDim = ops.mul(
            x = knowledgeBase, y = memory, dim = config.memDim,
            proj = proj, concat = concat,
            interMod = config.readMemAttType, name = "memInter")

        # presumably ops.mul stores the projected knowledge base under
        # proj["x"]; verify against ops.mul
        projectedKB = proj.get("x") if proj else None

        # project memory interactions back to hidden dimension
        if config.readMemProj:
            interactions = ops.linear(interactions, interDim, dim, act = config.readMemAct, name = "memKbProj")
        else:
            dim = interDim

        ## Step 2: compute interactions with control
        # NOTE(review): nesting reconstructed from a flattened source; confirm
        # that the KB concatenation and the nonlinearity belong under
        # config.readCtrl.
        if config.readCtrl:
            # bring control to the interaction width when they differ.
            # The input width is config.ctrlDim; the bare name `ctrlDim` used
            # previously is undefined and raised NameError on this path.
            if config.ctrlDim != dim:
                control = ops.linear(control, config.ctrlDim, dim, name = "ctrlProj")

            # NOTE(review): the interDim returned here is not used to update
            # dim; if config.readCtrlConcatInter widens the interactions, dim
            # may be stale. Confirm against ops.mul.
            interactions, interDim = ops.mul(
                interactions, control, dim,
                interMod = config.readCtrlAttType,
                concat = {"x": config.readCtrlConcatInter}, name = "ctrlInter")

            # optionally concatenate knowledge base elements
            if config.readCtrlConcatKB:
                if config.readCtrlConcatProj:
                    addedInp, addedDim = projectedKB, config.attDim
                else:
                    addedInp, addedDim = knowledgeBase, config.memDim
                interactions = tf.concat([interactions, addedInp], axis = -1)
                dim += addedDim

            # optional nonlinearity
            interactions = ops.activations[config.readCtrlAct](interactions)

        ## Step 3: sum attentions up over the knowledge base
        # transform vectors to attention distribution; with flag=True the
        # helper also returns the logits, which are not used here
        attention, _logits = ops.inter2att(interactions, dim, dropout = self.dropouts["read"], flag = True)

        self.attentions["kb"].append(attention)

        # optionally use projected knowledge base instead of original
        if config.readSmryKBProj:
            knowledgeBase = projectedKB

        # sum up the knowledge base according to the distribution
        information = ops.att2Smry(attention, knowledgeBase)

    return information