def GetQaPredFromJson(data, humanId):
    '''
    Build a per-question "prediction" dict from a SQuAD-style json
    structure, using one human annotator's answers as the predictions.

    @param data: list of articles; each article has "title" and
        "paragraphs", each paragraph has "qas" entries carrying "id",
        "question" and an "answers" list.
    @param humanId: index into the "answers" list; currently human id
        ranges from 1 to 2, and the 0th answer is the original ground
        truth.
    @return: dict mapping question id -> 1-tuple of QaPrediction.
    '''
    predictions = dict()
    for article in data:
        title = article["title"]
        for paragraph in article["paragraphs"]:
            for qas in paragraph["qas"]:
                qaId = qas["id"]
                # normalize the annotator's answer before wrapping it
                ansStr = qas["answers"][humanId]["text"].lower().strip()
                ansStr = CleanAnswer(ansStr)
                queryStr = qas["question"].lower().strip()
                pred = QaPrediction(title, qaId, ansStr, queryStr=queryStr)
                # defensive: make sure the attribute is set even if the
                # constructor ignores the keyword argument
                pred.queryStr = queryStr
                # stored as a tuple so downstream code can treat single
                # and multiple predictions uniformly
                predictions[qaId] = (pred, )
    return predictions
def PredictPerArticle(self, title, returnDict): ''' @param returnDict: used to get return value from different processes launched from multiprocessing ''' print "Predicting for " + title # get unigram for the context article = self.data[title] contextUni = self.GetContextUnigram(article) # get all candidate span for the context contextSpan = self.GetContextConstituentSpan(article) # flatten version of contextUni context = [] if self.articleLevel: for paraUni in contextUni: for sentenceUni in paraUni: context += sentenceUni else: for paraUni in contextUni: contextPara = [] for sentenceUni in paraUni: contextPara += sentenceUni context.append(contextPara) # the questions are organized according to paragraph # but candidates are generated from the whole passage predictions = dict() for qaParaId, paragraph in enumerate(self.data[title].paragraphs): # predByPara = list() for qa in paragraph.qas: # pred = dict() preds = list() scores = list() # bestScore = sys.float_info.min qS = qa.question.sentence[0] qUnigram = [token.word.lower() for token in qS.token] if self.articleLevel: for iPara, para, unigrams, spans \ in enumerate(zip(self.data[title].paragraphs, contextUni, contextSpan) ): # traverse each sentence in the paragraph for iSen, (s, uni, spanList) in enumerate( zip(para.context.sentence, unigrams, spans)): assert len(s.token) == len(uni) for span in spanList: beginId = span[0] endId = span[1] aUnigram = uni[beginId:endId] score = self.GetSlidingWindowScore(context, qUnigram, aUnigram) \ - self.lambDist * self.GetMinDistance(context, qUnigram, aUnigram) scores.append(score) ansStr = ReconstructStrFromSpan(s.token, span) ansToken = s.token[span[0]:span[1]] preds.append( QaPrediction(title, qaId, ansStr, iPara, iSen, ansToken=ansToken)) else: iPara = qaParaId para = self.data[title].paragraphs[iPara] unigrams = contextUni[iPara] spans = contextSpan[iPara] for iSen, (s, uni, spanList) in enumerate( zip(para.context.sentence, unigrams, spans)): assert len(s.token) == 
len(uni) for span in spanList: beginId = span[0] endId = span[1] aUnigram = uni[beginId:endId] score = self.GetSlidingWindowScore(context, qUnigram, aUnigram) \ - self.lambDist * self.GetMinDistance(context, qUnigram, aUnigram) scores.append(score) ansStr = ReconstructStrFromSpan(s.token, span) ansToken = s.token[span[0]:span[1]] preds.append( QaPrediction(title, qaId, ansStr, iPara, iSen, ansToken=ansToken, score=score)) scores = np.array(scores) preds = np.array(preds) scoreOrder = np.argsort(-scores) predictions[qaId] = preds[scoreOrder][ 0:min(self.topK, preds.size)].tolist() returnDict[title] = predictions
def PredictPerArticle(self, title, returnDict): ''' @param returnDict: used to get return value from different processes launched from multiprocessing ''' # get unigram and bigram for the context article = self.data[title] contextUni = self.GetContextUnigram(article) contextBi = self.GetContextBigram(article) # get all candidate span for the context contextSpan = self.GetContextConstituentSpan(article) # the questions are organized according to paragraph # but candidates are generated from the whole passage predictions = dict() print "Predicting for " + title for qaParaId, paragraph in enumerate(self.data[title].paragraphs): # predByPara = list() for qa in paragraph.qas: pred = dict() bestScore = sys.float_info.min bestSentence = None qS = qa.question.sentence[0] qUnigram = [token.word.lower() for token in qS.token] qBigram = self.GetBigramBySentence(qS.token) # traverse over paragraphs pred = dict() pred["answer"] = [] pred["token"] = [] # if ansSentence not in pred["sentence"]: pred["sentence"] = [] pred["sentenceToken"] = [] pred["paraId"] = [] pred["senId"] = [] for iPara, (para, unigrams, bigrams, spans) \ in enumerate(zip(self.data[title].paragraphs, contextUni, contextBi, contextSpan) ): # traverse each sentence in the paragraph if self.articleLevel == False and iPara != qaParaId: continue for iSen, (s, uni, bi, spanList) in enumerate( zip(para.context.sentence, unigrams, bigrams, spans)): assert len(s.token) == len(uni) assert len(s.token) == len(bi) + 1 for span in spanList: beginId = span[0] endId = span[1] cbUnigram = uni[0:beginId] caUnigram = uni[endId:] if beginId == 0: cbBigram = [] else: cbBigram = bi[0:(beginId - 1)] caBigram = bi[endId:] aUnigram = uni[beginId:endId] aBigram = bi[beginId:(endId - 1)] # if len(aUnigram) == 1 and (aUnigram[0] == "." or aUnigram[0] == "?" or aUnigram[0] == "!" 
or aUnigram[0] == "the" or aUnigram[0] == "The"): if len(aUnigram ) == 1 and aUnigram[0] in self.stopWords: continue if len(aUnigram) > 3: continue ansStr = ReconstructStrFromSpan(s.token, span) ansToken = s.token[span[0]:span[1]] ansSentence = ReconstructStrFromSpan( s.token, (0, len(s.token))) ansSentenceToken = s.token[0:len(s.token)] assert len(s.token) != 0 # ansSentenceToken = [token.word for token in s.token] pred["answer"].append(ansStr) pred["token"].append(ansToken) # if ansSentence not in pred["sentence"]: pred["sentence"].append(ansSentence) pred["sentenceToken"].append(ansSentenceToken) pred["paraId"].append(iPara) pred["senId"].append(iSen) # filter from the candidates for best choice preds = list() for ansToken, sentenceToken, ansStr, iPara, iSen \ in zip(pred["token"], pred["sentenceToken"], pred["answer"], pred["paraId"], pred["senId"] ): ansToken = [token.word.lower() for token in ansToken] if ansToken[-1] == ".": ansToken = ansToken[:-1] if ansToken[0] == "The" or ansToken[0] == "the": ansToken = ansToken[1:] if len(ansToken) == 0: print " zero length ans token" preds.append( QaPrediction(title, qa.id, ansStr, iPara, iSen, ansToken=ansToken)) random.shuffle(preds) predictions[qa.id] = preds[0:min(self.topK, len(preds))] predictions[qa.id] = predictions[qa.id][0].ansStr returnDict[title] = predictions
def PredictPerArticle(self, title, returnDict):
    # def PredictPerArticle(self, title):
    '''
    Score every constituent span of an article against each question with
    the context-overlap score, keep the tied-best candidates, then rerank
    those candidates with the sliding-window agent and keep the single
    best answer string per question.

    @param returnDict: used to get return value from different processes
    launched from multiprocessing; this article's predictions are stored
    under returnDict[title] as {question id: answer string}
    '''
    # get unigram and bigram for the context
    article = self.data[title]
    contextUni = self.GetContextUnigram(article)
    contextBi = self.GetContextBigram(article)
    # get all candidate span for the context
    contextSpan = self.GetContextConstituentSpan(article)
    # the questions are organized according to paragraph
    # but candidates are generated from the whole passage
    predictions = dict()
    # print "Predicting for " + title
    for qaParaId, paragraph in enumerate(self.data[title].paragraphs):
        # predByPara = list()
        for qa in paragraph.qas:
            pred = dict()
            bestScore = sys.float_info.min
            bestSentence = None
            qS = qa.question.sentence[0]
            qUnigram = [token.word.lower() for token in qS.token]
            qBigram = self.GetBigramBySentence(qS.token)
            # traverse over paragraphs
            pred = dict()
            for iPara, (para, unigrams, bigrams, spans) \
                in enumerate(zip(self.data[title].paragraphs,
                                 contextUni, contextBi, contextSpan) ):
                # traverse each sentence in the paragraph
                if self.articleLevel == False and iPara != qaParaId:
                    continue
                for iSen, (s, uni, bi, spanList) in enumerate(
                        zip(para.context.sentence, unigrams, bigrams,
                            spans)):
                    assert len(s.token) == len(uni)
                    assert len(s.token) == len(bi) + 1
                    for span in spanList:
                        beginId = span[0]
                        endId = span[1]
                        # context unigrams/bigrams before (cb*) and
                        # after (ca*) the candidate span
                        cbUnigram = uni[0:beginId]
                        caUnigram = uni[endId:]
                        if beginId == 0:
                            cbBigram = []
                        else:
                            cbBigram = bi[0:(beginId - 1)]
                        caBigram = bi[endId:]
                        # the candidate answer's own unigrams/bigrams
                        aUnigram = uni[beginId:endId]
                        aBigram = bi[beginId:(endId - 1)]
                        # if len(aUnigram) == 1 and (aUnigram[0] == "." or aUnigram[0] == "?" or aUnigram[0] == "!" or aUnigram[0] == "the" or aUnigram[0] == "The"):
                        # skip single-token stopword candidates
                        if len(aUnigram) == 1 and aUnigram[
                                0] in self.slidingWindowAgent.stopWords:
                            continue
                        score = self.GetContextScore(
                            qUnigram, qBigram, cbUnigram, caUnigram,
                            cbBigram, caBigram, aUnigram, aBigram,
                            self.stopWords)
                        # new strict best: restart the candidate lists
                        if score > bestScore or len(pred) == 0:
                            ansStr = ReconstructStrFromSpan(s.token, span)
                            ansToken = s.token[span[0]:span[1]]
                            ansSentence = ReconstructStrFromSpan(
                                s.token, (0, len(s.token)))
                            ansSentenceToken = s.token[0:len(s.token)]
                            pred = {
                                "id": qa.id,
                                "answer": [
                                    ansStr,
                                ],
                                "token": [
                                    ansToken,
                                ],
                                "sentence": [
                                    ansSentence,
                                ],
                                "sentenceToken": [
                                    ansSentenceToken,
                                ],
                                "paraId": [
                                    iPara,
                                ],
                                "senId": [
                                    iSen,
                                ]
                            }
                            bestScore = score
                            # the current dataset has empty s.text field
                            # assert ansStr in s.text
                        # note we permit multiple answers
                        elif score == bestScore:
                            ansStr = ReconstructStrFromSpan(s.token, span)
                            ansToken = s.token[span[0]:span[1]]
                            ansSentence = ReconstructStrFromSpan(
                                s.token, (0, len(s.token)))
                            ansSentenceToken = s.token[0:len(s.token)]
                            assert len(s.token) != 0
                            # ansSentenceToken = [token.word for token in s.token]
                            pred["answer"].append(ansStr)
                            pred["token"].append(ansToken)
                            # if ansSentence not in pred["sentence"]:
                            pred["sentence"].append(ansSentence)
                            pred["sentenceToken"].append(ansSentenceToken)
                            pred["paraId"].append(iPara)
                            pred["senId"].append(iSen)
            # filter from the candidates for best choice
            preds = list()
            # NOTE(review): when slidingWindowAgent is None this qa gets no
            # entry in predictions at all -- presumably the agent is always
            # configured (the stopword check above already dereferences it
            # unconditionally); confirm against the callers.
            if self.slidingWindowAgent != None:
                subAgent = self.slidingWindowAgent
                slidingScores = []
                for ansToken, sentenceToken, ansStr, iPara, iSen \
                    in zip(pred["token"], pred["sentenceToken"],
                           pred["answer"], pred["paraId"],
                           pred["senId"] ):
                    ansToken = [token.word.lower() for token in ansToken]
                    # strip a trailing period and a leading article
                    if ansToken[-1] == ".":
                        ansToken = ansToken[:-1]
                    if ansToken[0] == "The" or ansToken[0] == "the":
                        ansToken = ansToken[1:]
                    if len(ansToken) == 0:
                        print " zero length ans token"
                    sentenceToken = [
                        token.word.lower() for token in sentenceToken
                    ]
                    # rerank tied candidates by sliding-window distance score
                    slidingScores.append(
                        subAgent.GetSlidingDistScore(sentenceToken,
                                                     qUnigram, ansToken))
                    preds.append(
                        QaPrediction(title, qa.id, ansStr, iPara, iSen,
                                     ansToken=ansToken,
                                     score=slidingScores[-1]))
                # order by descending sliding score and keep the topK
                slidingScores = np.array(slidingScores)
                preds = np.array(preds)
                scoreOrder = np.argsort(-slidingScores)
                predictions[qa.id] = preds[scoreOrder[
                    0:min(self.topK, scoreOrder.size)]].tolist()
                # cut for the top 1 prediction
                predictions[qa.id] = predictions[qa.id][0].ansStr
    returnDict[title] = predictions
def Predict(self, samples, candInput, candGlobalId, candData, origData, session): ''' predict each sample in samples with the related data in candInput. @param candInput: it can be either self.trainCandInput or self.evalCandInput candInput is produced via self.PrepareEvalInput we reuse the interface of paRnnInput and pcRnnInput to get the scores @param candGlobalId: either self.trainCandGlobalId or evalCandGlobalId @param candData: either self.traincandData or self.evalcandData @param origData: either self.trainOrigData or self.evalOrigData ''' candPadded, contextPadded, candLen, contextLen = candInput prediction = dict() #set this to reuse training computational graph for evaluation batchSize = self.batchSize for iSample, sample in enumerate(samples): title = sample.title qaId = sample.id if self.articleLevel: nCand = len(candPadded[title] ) qRnnInput = np.array(sample.query).reshape( (1, len(sample.query) ) ) qSeqLen = np.array( (len(sample.query), ) ) paRnnInput = candPadded[title] paSeqLen = candLen[title] pcRnnInput = contextPadded[title] pcSeqLen = contextLen[title] else: paraId = sample.pAnsParaId nCand = len(candPadded[title][paraId] ) qRnnInput = np.array(sample.query).reshape( (1, len(sample.query) ) ) qSeqLen = np.array( (len(sample.query), ) ) paRnnInput = candPadded[title][paraId] paSeqLen = candLen[title][paraId] pcRnnInput = contextPadded[title][paraId] pcSeqLen = contextLen[title][paraId] dataBatch = {self.qRnnInputEval : qRnnInput, self.aRnnInputEval : paRnnInput, self.cRnnInputEval : pcRnnInput, self.qSeqLenEval : qSeqLen, self.aSeqLenEval : paSeqLen, self.cSeqLenEval : pcSeqLen} dataBatch = self.GetPredictBatch(dataBatch) scores = session.run(self.evalScore, feed_dict=dataBatch) predId = np.argmax(scores) if self.articleLevel == False: # from paragraph level span id to article level span id globalId = [idx for idSen in candGlobalId[title][sample.pAnsParaId] for idx in idSen] predId = globalId[predId] predInfo = 
candData[title].candidateAnswers[predId] predParaId = predInfo.paragraphIndex predSenId = predInfo.sentenceIndex predSpanStart = predInfo.spanBeginIndex predSpanEnd = predInfo.spanBeginIndex + predInfo.spanLength tokens = origData[title].paragraphs[predParaId].context.sentence[predSenId].token[predSpanStart:predSpanEnd] predStr = ReconstructStrFromSpan(tokens, (0, len(tokens) ) ) prediction[qaId] = QaPrediction(title, qaId, predStr, predParaId, predSenId) if (iSample + 1) % 500 == 0 or iSample == len(samples) - 1: print "predicted ", str(iSample + 1), " / " , str(len(samples) ), " samples!" predForDump = dict() for qaId in prediction.keys(): predForDump[qaId] = prediction[qaId].ansStr return prediction, predForDump
def Predict(self, samples, candInput, candGlobalId, candData, origData,
            session):
    '''
    predict each sample in samples with the related data in candInput,
    returning a topK list of predictions per question.

    @param candInput: it can be either self.trainCandInput or
        self.evalCandInput. candInput is produced via
        self.PrepareEvalInput. we reuse the interface of paRnnInput and
        pcRnnInput to get the scores
    @param candGlobalId: either self.trainCandGlobalId or evalCandGlobalId
    @param candData: either self.traincandData or self.evalcandData
    @param origData: either self.trainOrigData or self.evalOrigData
    @param session: TF session used to evaluate self.evalScore.
    @return: dict mapping question id -> list of up to self.predTopK
        QaPrediction objects, best first.
    '''
    candPadded, contextPadded, candLen, contextLen = candInput
    prediction = dict()
    topK = self.predTopK
    # NOTE: removed the unused local nCand and the large commented-out
    # debug dumps from the original; behavior is unchanged.
    for iSample, sample in enumerate(samples):
        title = sample.title
        qaId = sample.id
        # the query tensors are identical in both branches; build once
        qRnnInput = np.array(sample.query).reshape((1, len(sample.query)))
        qSeqLen = np.array((len(sample.query), ))
        if self.articleLevel:
            paRnnInput = candPadded[title]
            paSeqLen = candLen[title]
            pcRnnInput = contextPadded[title]
            pcSeqLen = contextLen[title]
        else:
            paraId = sample.pAnsParaId
            paRnnInput = candPadded[title][paraId]
            paSeqLen = candLen[title][paraId]
            pcRnnInput = contextPadded[title][paraId]
            pcSeqLen = contextLen[title][paraId]
        batchData = {
            self.qRnnInputEval: qRnnInput,
            self.aRnnInputEval: paRnnInput,
            self.cRnnInputEval: pcRnnInput,
            self.qSeqLenEval: qSeqLen,
            self.aSeqLenEval: paSeqLen,
            self.cSeqLenEval: pcSeqLen
        }
        batchData = self.GetPredictBatch(batchData)
        scores = session.run(self.evalScore, feed_dict=batchData)
        if self.articleLevel == False:
            # from paragraph level span id to article level span id;
            # this mapping is invariant across the topK loop, so build it
            # once per sample instead of once per ranked candidate
            globalId = [
                idx
                for idSen in candGlobalId[title][sample.pAnsParaId]
                for idx in idSen
            ]
        else:
            globalId = None
        # predict a topK list, best score first
        predIdSort = np.argsort(-scores[0, :])
        prediction[qaId] = list()
        for i in range(min(topK, scores.size)):
            predId = predIdSort[i]
            if globalId is not None:
                predId = globalId[predId]
            # map the candidate id back to its tokens in the article
            predInfo = candData[title].candidateAnswers[predId]
            predParaId = predInfo.paragraphIndex
            predSenId = predInfo.sentenceIndex
            predSpanStart = predInfo.spanBeginIndex
            predSpanEnd = predInfo.spanBeginIndex + predInfo.spanLength
            tokens = origData[title].paragraphs[
                predParaId].context.sentence[predSenId].token[
                    predSpanStart:predSpanEnd]
            predStr = ReconstructStrFromSpan(tokens, (0, len(tokens)))
            prediction[qaId].append(
                QaPrediction(title, qaId, predStr, predParaId, predSenId,
                             ansToken=tokens))
    return prediction