示例#1
0
def GetQaPredFromJson(data, humanId):
    '''
    Build a {qaId: (QaPrediction, )} dict from a SQuAD-style json structure,
    taking the humanId-th answer for each question as the prediction.

    @param data: list of articles, each with "title" and "paragraphs" whose
    "qas" entries carry "id", "question" and an "answers" list
    @param humanId: which annotator's answer to use; currently ranges
    from 1 to 2 (the 0-th answer is the original ground truth)
    @return: dict mapping question id to a one-element tuple of QaPrediction
    '''
    predictions = dict()
    for article in data:
        title = article["title"]
        for paragraph in article["paragraphs"]:
            for qas in paragraph["qas"]:
                qaId = qas["id"]
                # normalize and clean the chosen human answer
                ansStr = qas["answers"][humanId]["text"].lower().strip()
                ansStr = CleanAnswer(ansStr)
                queryStr = qas["question"].lower().strip()
                # queryStr is passed as a kwarg; the original redundant
                # `pred.queryStr = queryStr` re-assignment is dropped
                pred = QaPrediction(title, qaId, ansStr, queryStr=queryStr)
                # tuple-wrap to match the multi-prediction interface used elsewhere
                predictions[qaId] = (pred, )
    return predictions
示例#2
0
    def PredictPerArticle(self, title, returnDict):
        '''
        Score every candidate constituent span in the article for each
        question using a sliding-window lexical score minus a distance
        penalty, and keep the topK best candidates per question.

        @param title: article title, key into self.data
        @param returnDict: used to get return value from
        different processes launched from multiprocessing
        '''
        print("Predicting for " + title)
        article = self.data[title]
        # per-paragraph, per-sentence unigram lists for the context
        contextUni = self.GetContextUnigram(article)
        # candidate answer spans per paragraph / sentence
        contextSpan = self.GetContextConstituentSpan(article)

        # flatten contextUni: one flat token list at article level,
        # otherwise one flat token list per paragraph
        if self.articleLevel:
            context = [token
                       for paraUni in contextUni
                       for sentenceUni in paraUni
                       for token in sentenceUni]
        else:
            context = [[token for sentenceUni in paraUni for token in sentenceUni]
                       for paraUni in contextUni]

        # questions are organized by paragraph, but at article level the
        # candidates are generated from the whole passage
        predictions = dict()
        for qaParaId, paragraph in enumerate(self.data[title].paragraphs):
            for qa in paragraph.qas:
                preds = list()
                scores = list()
                qS = qa.question.sentence[0]
                qUnigram = [token.word.lower() for token in qS.token]

                def scoreParagraph(iPara, para, unigrams, spans):
                    # Score every candidate span in one paragraph, appending
                    # to preds/scores in the enclosing scope.
                    for iSen, (s, uni, spanList) in enumerate(
                            zip(para.context.sentence, unigrams, spans)):
                        assert len(s.token) == len(uni)
                        for span in spanList:
                            aUnigram = uni[span[0]:span[1]]
                            # NOTE(review): `context` is passed whole in both
                            # modes, mirroring the original code — confirm
                            # GetSlidingWindowScore accepts the nested shape
                            # in paragraph-level mode.
                            score = self.GetSlidingWindowScore(context, qUnigram, aUnigram) \
                                - self.lambDist * self.GetMinDistance(context, qUnigram, aUnigram)
                            scores.append(score)
                            ansStr = ReconstructStrFromSpan(s.token, span)
                            ansToken = s.token[span[0]:span[1]]
                            # BUG FIX: original referenced undefined `qaId`
                            # (NameError); use qa.id. Also pass score= in both
                            # modes for consistency.
                            preds.append(
                                QaPrediction(title,
                                             qa.id,
                                             ansStr,
                                             iPara,
                                             iSen,
                                             ansToken=ansToken,
                                             score=score))

                if self.articleLevel:
                    # BUG FIX: original `for iPara, para, unigrams, spans in
                    # enumerate(zip(...))` was a broken unpacking (ValueError);
                    # the zip triple must be unpacked as one tuple.
                    for iPara, (para, unigrams, spans) in enumerate(
                            zip(self.data[title].paragraphs, contextUni, contextSpan)):
                        scoreParagraph(iPara, para, unigrams, spans)
                else:
                    iPara = qaParaId
                    scoreParagraph(iPara,
                                   self.data[title].paragraphs[iPara],
                                   contextUni[iPara],
                                   contextSpan[iPara])

                # rank candidates by descending score and keep the topK
                scores = np.array(scores)
                preds = np.array(preds)
                scoreOrder = np.argsort(-scores)
                predictions[qa.id] = preds[scoreOrder][
                    0:min(self.topK, preds.size)].tolist()
        returnDict[title] = predictions
示例#3
0
    def PredictPerArticle(self, title, returnDict):
        '''
        Random-baseline predictor: collect every candidate constituent span
        (filtered by stop words and a max length of 3 tokens), shuffle them,
        and keep the top-1 answer string per question.

        @param title: article title, key into self.data
        @param returnDict: used to get return value from
        different processes launched from multiprocessing
        '''
        # get unigram and bigram for the context
        article = self.data[title]
        contextUni = self.GetContextUnigram(article)
        contextBi = self.GetContextBigram(article)
        # get all candidate spans for the context
        contextSpan = self.GetContextConstituentSpan(article)
        # questions are organized by paragraph, but at article level the
        # candidates are generated from the whole passage
        predictions = dict()
        print("Predicting for " + title)
        for qaParaId, paragraph in enumerate(self.data[title].paragraphs):
            for qa in paragraph.qas:
                # candidate pool for this question (original duplicated this
                # init and kept unused bestScore/bestSentence/qUnigram/qBigram
                # locals — all removed)
                pred = {"answer": [], "token": [], "sentence": [],
                        "sentenceToken": [], "paraId": [], "senId": []}
                for iPara, (para, unigrams, bigrams, spans) in enumerate(
                        zip(self.data[title].paragraphs, contextUni,
                            contextBi, contextSpan)):
                    # paragraph-level mode only looks at the question's paragraph
                    if not self.articleLevel and iPara != qaParaId:
                        continue
                    for iSen, (s, uni, bi, spanList) in enumerate(
                            zip(para.context.sentence, unigrams, bigrams,
                                spans)):
                        assert len(s.token) == len(uni)
                        assert len(s.token) == len(bi) + 1
                        for span in spanList:
                            aUnigram = uni[span[0]:span[1]]
                            # skip single stop-word candidates and long spans
                            if len(aUnigram) == 1 and aUnigram[0] in self.stopWords:
                                continue
                            if len(aUnigram) > 3:
                                continue
                            assert len(s.token) != 0
                            pred["answer"].append(
                                ReconstructStrFromSpan(s.token, span))
                            pred["token"].append(s.token[span[0]:span[1]])
                            pred["sentence"].append(
                                ReconstructStrFromSpan(s.token, (0, len(s.token))))
                            pred["sentenceToken"].append(s.token[0:len(s.token)])
                            pred["paraId"].append(iPara)
                            pred["senId"].append(iSen)
                # turn candidates into QaPrediction objects
                preds = list()
                for ansToken, ansStr, iPara, iSen in zip(
                        pred["token"], pred["answer"],
                        pred["paraId"], pred["senId"]):
                    ansToken = [token.word.lower() for token in ansToken]
                    # strip trailing period and leading article; guard against
                    # empty lists (original indexed [-1]/[0] unconditionally)
                    if ansToken and ansToken[-1] == ".":
                        ansToken = ansToken[:-1]
                    # tokens are already lowercased, so checking "the" suffices
                    # (original also compared against dead "The")
                    if ansToken and ansToken[0] == "the":
                        ansToken = ansToken[1:]
                    if len(ansToken) == 0:
                        print(" zero length ans token")
                    preds.append(
                        QaPrediction(title,
                                     qa.id,
                                     ansStr,
                                     iPara,
                                     iSen,
                                     ansToken=ansToken))
                # random baseline: shuffle then cut to topK
                random.shuffle(preds)
                predictions[qa.id] = preds[0:min(self.topK, len(preds))]
                # keep only the top-1 answer string; original crashed with
                # IndexError when no candidate survived the filters
                if predictions[qa.id]:
                    predictions[qa.id] = predictions[qa.id][0].ansStr
        returnDict[title] = predictions
示例#4
0
    def PredictPerArticle(self, title, returnDict):
        '''
        Find the best-scoring candidate span(s) per question by context
        score (ties kept), then rerank the collected candidates with the
        sliding-window agent and keep the top-1 answer string.

        @param title: article title, key into self.data
        @param returnDict: used to get return value from
        different processes launched from multiprocessing
        '''
        # get unigram and bigram for the context
        article = self.data[title]
        contextUni = self.GetContextUnigram(article)
        contextBi = self.GetContextBigram(article)
        # get all candidate spans for the context
        contextSpan = self.GetContextConstituentSpan(article)
        # questions are organized by paragraph, but at article level the
        # candidates are generated from the whole passage
        predictions = dict()
        for qaParaId, paragraph in enumerate(self.data[title].paragraphs):
            for qa in paragraph.qas:
                bestScore = sys.float_info.min
                qS = qa.question.sentence[0]
                qUnigram = [token.word.lower() for token in qS.token]
                qBigram = self.GetBigramBySentence(qS.token)
                # best-so-far candidates; ties are accumulated
                # (original initialized this dict twice)
                pred = dict()
                for iPara, (para, unigrams, bigrams, spans) in enumerate(
                        zip(self.data[title].paragraphs, contextUni,
                            contextBi, contextSpan)):
                    # paragraph-level mode only scores the question's paragraph
                    if not self.articleLevel and iPara != qaParaId:
                        continue
                    for iSen, (s, uni, bi, spanList) in enumerate(
                            zip(para.context.sentence, unigrams, bigrams,
                                spans)):
                        assert len(s.token) == len(uni)
                        assert len(s.token) == len(bi) + 1
                        for span in spanList:
                            beginId = span[0]
                            endId = span[1]
                            # context unigrams/bigrams before and after the span
                            cbUnigram = uni[0:beginId]
                            caUnigram = uni[endId:]
                            cbBigram = [] if beginId == 0 else bi[0:(beginId - 1)]
                            caBigram = bi[endId:]
                            aUnigram = uni[beginId:endId]
                            aBigram = bi[beginId:(endId - 1)]
                            # skip single stop-word candidates
                            # NOTE(review): reads self.slidingWindowAgent here
                            # unguarded, so a None agent crashes before the
                            # guarded rerank below — mirrors the original.
                            if len(aUnigram) == 1 and \
                                    aUnigram[0] in self.slidingWindowAgent.stopWords:
                                continue
                            score = self.GetContextScore(
                                qUnigram, qBigram, cbUnigram, caUnigram,
                                cbBigram, caBigram, aUnigram, aBigram,
                                self.stopWords)

                            if score > bestScore or len(pred) == 0:
                                # new best: restart the candidate lists
                                pred = {
                                    "id": qa.id,
                                    "answer": [ReconstructStrFromSpan(s.token, span)],
                                    "token": [s.token[span[0]:span[1]]],
                                    "sentence": [ReconstructStrFromSpan(
                                        s.token, (0, len(s.token)))],
                                    "sentenceToken": [s.token[0:len(s.token)]],
                                    "paraId": [iPara],
                                    "senId": [iSen],
                                }
                                bestScore = score
                                # the current dataset has empty s.text field,
                                # so no `ansStr in s.text` assertion
                            elif score == bestScore:
                                # note we permit multiple (tied) answers
                                assert len(s.token) != 0
                                pred["answer"].append(
                                    ReconstructStrFromSpan(s.token, span))
                                pred["token"].append(s.token[span[0]:span[1]])
                                pred["sentence"].append(ReconstructStrFromSpan(
                                    s.token, (0, len(s.token))))
                                pred["sentenceToken"].append(s.token[0:len(s.token)])
                                pred["paraId"].append(iPara)
                                pred["senId"].append(iSen)
                # rerank the tied best candidates with the sliding-window agent
                preds = list()
                # guard `pred` too: original raised KeyError when no candidate
                # was ever collected
                if self.slidingWindowAgent is not None and pred:
                    subAgent = self.slidingWindowAgent
                    slidingScores = []
                    for ansToken, sentenceToken, ansStr, iPara, iSen in zip(
                            pred["token"], pred["sentenceToken"],
                            pred["answer"], pred["paraId"], pred["senId"]):
                        ansToken = [token.word.lower() for token in ansToken]
                        # strip trailing period and leading article; guard
                        # against empty lists (original indexed unconditionally)
                        if ansToken and ansToken[-1] == ".":
                            ansToken = ansToken[:-1]
                        # tokens already lowercased; the original "The" compare
                        # was dead code
                        if ansToken and ansToken[0] == "the":
                            ansToken = ansToken[1:]
                        if len(ansToken) == 0:
                            print(" zero length ans token")
                        sentenceToken = [
                            token.word.lower() for token in sentenceToken
                        ]
                        slidingScores.append(
                            subAgent.GetSlidingDistScore(
                                sentenceToken, qUnigram, ansToken))
                        preds.append(
                            QaPrediction(title,
                                         qa.id,
                                         ansStr,
                                         iPara,
                                         iSen,
                                         ansToken=ansToken,
                                         score=slidingScores[-1]))

                    slidingScores = np.array(slidingScores)
                    preds = np.array(preds)
                    scoreOrder = np.argsort(-slidingScores)
                    predictions[qa.id] = preds[scoreOrder[
                        0:min(self.topK, scoreOrder.size)]].tolist()
                    # cut to the top-1 answer string; guard empty list
                    if predictions[qa.id]:
                        predictions[qa.id] = predictions[qa.id][0].ansStr
        returnDict[title] = predictions
示例#5
0
    def Predict(self, samples, candInput, candGlobalId, candData, origData, session):
        '''
        Predict each sample in samples with the related data in candInput.

        @param candInput: it can be either self.trainCandInput or self.evalCandInput;
        candInput is produced via self.PrepareEvalInput.
        We reuse the interface of paRnnInput and pcRnnInput to get the scores.
        @param candGlobalId: either self.trainCandGlobalId or evalCandGlobalId
        @param candData: either self.traincandData or self.evalcandData
        @param origData: either self.trainOrigData or self.evalOrigData
        @param session: TF-style session with a run(fetch, feed_dict=...) interface
        @return: (prediction, predForDump) where prediction maps qa id to a
        QaPrediction and predForDump maps qa id to its answer string
        '''
        candPadded, contextPadded, candLen, contextLen = candInput
        prediction = dict()
        for iSample, sample in enumerate(samples):
            title = sample.title
            qaId = sample.id
            # the query input is identical in both modes — hoisted out of the
            # if/else (unused nCand/batchSize locals removed)
            qRnnInput = np.array(sample.query).reshape((1, len(sample.query)))
            qSeqLen = np.array((len(sample.query), ))
            if self.articleLevel:
                paRnnInput = candPadded[title]
                paSeqLen = candLen[title]
                pcRnnInput = contextPadded[title]
                pcSeqLen = contextLen[title]
            else:
                paraId = sample.pAnsParaId
                paRnnInput = candPadded[title][paraId]
                paSeqLen = candLen[title][paraId]
                pcRnnInput = contextPadded[title][paraId]
                pcSeqLen = contextLen[title][paraId]

            dataBatch = {self.qRnnInputEval: qRnnInput,
                         self.aRnnInputEval: paRnnInput,
                         self.cRnnInputEval: pcRnnInput,
                         self.qSeqLenEval: qSeqLen,
                         self.aSeqLenEval: paSeqLen,
                         self.cSeqLenEval: pcSeqLen}
            dataBatch = self.GetPredictBatch(dataBatch)

            scores = session.run(self.evalScore, feed_dict=dataBatch)
            predId = np.argmax(scores)
            if not self.articleLevel:
                # map paragraph-level span id to article-level span id
                globalId = [idx
                            for idSen in candGlobalId[title][sample.pAnsParaId]
                            for idx in idSen]
                predId = globalId[predId]
            predInfo = candData[title].candidateAnswers[predId]
            predParaId = predInfo.paragraphIndex
            predSenId = predInfo.sentenceIndex
            predSpanStart = predInfo.spanBeginIndex
            predSpanEnd = predInfo.spanBeginIndex + predInfo.spanLength

            tokens = origData[title].paragraphs[predParaId].context.sentence[
                predSenId].token[predSpanStart:predSpanEnd]
            predStr = ReconstructStrFromSpan(tokens, (0, len(tokens)))
            prediction[qaId] = QaPrediction(title, qaId, predStr, predParaId, predSenId)
            if (iSample + 1) % 500 == 0 or iSample == len(samples) - 1:
                print("predicted " + str(iSample + 1) + " / " +
                      str(len(samples)) + " samples!")
        # BUG FIX: the original rebuilt predForDump from scratch inside the
        # sample loop (O(n^2)) and left it undefined for empty `samples`
        # (NameError on return); build it once here instead.
        predForDump = dict()
        for key in prediction:
            predForDump[key] = prediction[key].ansStr
        return prediction, predForDump
示例#6
0
    def Predict(self, samples, candInput, candGlobalId, candData, origData,
                session):
        '''
        Predict a top-K list of candidate answers for each sample.

        @param candInput: it can be either self.trainCandInput or self.evalCandInput;
        candInput is produced via self.PrepareEvalInput.
        We reuse the interface of paRnnInput and pcRnnInput to get the scores.
        @param candGlobalId: either self.trainCandGlobalId or evalCandGlobalId
        @param candData: either self.traincandData or self.evalcandData
        @param origData: either self.trainOrigData or self.evalOrigData
        @param session: TF-style session with a run(fetch, feed_dict=...) interface
        @return: dict mapping qa id to a list of up to self.predTopK
        QaPrediction objects, best score first
        '''
        candPadded, contextPadded, candLen, contextLen = candInput
        prediction = dict()
        topK = self.predTopK

        for sample in samples:
            title = sample.title
            qaId = sample.id
            # the query input is identical in both modes — hoisted out of the
            # if/else (unused nCand local removed)
            qRnnInput = np.array(sample.query).reshape((1, len(sample.query)))
            qSeqLen = np.array((len(sample.query), ))
            if self.articleLevel:
                paRnnInput = candPadded[title]
                paSeqLen = candLen[title]
                pcRnnInput = contextPadded[title]
                pcSeqLen = contextLen[title]
            else:
                paraId = sample.pAnsParaId
                paRnnInput = candPadded[title][paraId]
                paSeqLen = candLen[title][paraId]
                pcRnnInput = contextPadded[title][paraId]
                pcSeqLen = contextLen[title][paraId]

            batchData = {
                self.qRnnInputEval: qRnnInput,
                self.aRnnInputEval: paRnnInput,
                self.cRnnInputEval: pcRnnInput,
                self.qSeqLenEval: qSeqLen,
                self.aSeqLenEval: paSeqLen,
                self.cSeqLenEval: pcSeqLen
            }
            batchData = self.GetPredictBatch(batchData)
            scores = session.run(self.evalScore, feed_dict=batchData)

            # hoisted out of the top-K loop below: the paragraph-to-article
            # span-id map is loop-invariant (original rebuilt it per rank)
            if not self.articleLevel:
                globalId = [
                    idx for idSen in candGlobalId[title][sample.pAnsParaId]
                    for idx in idSen
                ]

            # predict a topK list, best score first
            predIdSort = np.argsort(-scores[0, :])
            prediction[qaId] = list()
            for i in range(min(topK, scores.size)):
                predId = predIdSort[i]
                if not self.articleLevel:
                    # from paragraph-level span id to article-level span id
                    predId = globalId[predId]
                predInfo = candData[title].candidateAnswers[predId]
                predParaId = predInfo.paragraphIndex
                predSenId = predInfo.sentenceIndex
                predSpanStart = predInfo.spanBeginIndex
                predSpanEnd = predInfo.spanBeginIndex + predInfo.spanLength
                tokens = origData[title].paragraphs[
                    predParaId].context.sentence[predSenId].token[
                        predSpanStart:predSpanEnd]
                predStr = ReconstructStrFromSpan(tokens, (0, len(tokens)))
                prediction[qaId].append(
                    QaPrediction(title,
                                 qaId,
                                 predStr,
                                 predParaId,
                                 predSenId,
                                 ansToken=tokens))
        return prediction