예제 #1
0
파일: QA.py 프로젝트: jinpoon/11611_project
 def __init__(self):
     """Create the NLP helpers, the lookup tables and the candidate buffers."""
     self.sNLP = StanfordNLP()
     # The three tables must exist before initQstType() is called on them.
     self.dropType, self.typeNer, self.typePro = {}, {}, {}
     self.initQstType()
     self.candidateAnswer, self.candidateSentence = [], []
     self.qgPipeline = QGPipeline()
     # Numeric cut-off stored for later score comparisons.
     self.threshold = 90
예제 #2
0
def When_module(sent, sent_features):
    """Turn ``sent`` into a "when" question, if a candidate can be found.

    ``when_parseTraversal`` is run over the parse of ``sent`` and fills two
    lists: candidate sentences, and "structures" whose item 0 is a phrase of
    ``sent`` and item 1 a number used for ranking.  Structures take priority
    over candidate sentences.

    Args:
        sent: Sentence text.
        sent_features: Not read by this function.

    Returns:
        The result of ``construct_when`` for the chosen candidate, or ``None``
        when the traversal produced neither structures nor candidates.
    """
    nlp = StanfordNLP()
    candidates = []
    ranked_phrases = []

    def triples_for(text):
        # Only the first dependency graph the parser yields is consulted.
        graph = next(nlp.dependency_parse(text))
        return list(graph.triples())

    when_parseTraversal(sent, nlp.parse(sent), candidates, ranked_phrases)

    if ranked_phrases:
        # Keep the phrase with the smallest rank; earlier entries win ties.
        lowest = float('Inf')
        chosen_phrase = ""
        for entry in ranked_phrases:
            if entry[1] < lowest:
                chosen_phrase, lowest = entry[0], entry[1]
        stripped = sent.replace(chosen_phrase, "")
        return construct_when(stripped, triples_for(stripped))

    if candidates:
        # Only the first candidate sentence is ever used.
        head = candidates[0]
        return construct_when(head, triples_for(head))

    return None
예제 #3
0
 def __init__(self, dataFile):
     """Store the given text, preprocess it and pass it to ``tokenizePara``.

     Args:
         dataFile: Used directly as the text to process; the line that
             would have read it as a file path is commented out below.
     """
     self.sNLP = StanfordNLP()
     # NOTE(review): presumably the sentence-ending characters - confirm
     # where self.punc is read.
     self.punc = {'.', '?', '!', '\n'}
     # self.textData = open(dataFile,"r", encoding='utf-8').read()
     self.textData = dataFile
     self.textData = self.preProcessText(self.textData)
     # Created before tokenizePara() runs; presumably filled by it - confirm.
     self.sentence_list = []
     # self.getSentenceList(self.textData)
     self.tokenizePara(self.textData)
예제 #4
0
def Where_Which_module(sent, sent_features):
    """Generate "where"/"which" questions for ``sent``.

    ``where_which_inFirstPP`` fills a list of 4-item entries
    ``(flag, sentence, ner_set, pp_text)``; only the first entry is used.
    When its flag is truthy a single question is built with
    ``construct_where_which``, otherwise ``where_which_parseTraversal``
    collects the questions.

    Args:
        sent: Sentence text.
        sent_features: Not read by this function.

    Returns:
        A list of questions, or ``None`` when ``where_which_inFirstPP``
        produced no entries.
    """
    nlp = StanfordNLP()
    questions = []
    pp_entries = []

    where_which_inFirstPP(sent, nlp.parse(sent), pp_entries)
    if not pp_entries:
        return None

    use_direct, this_sent, ner_set, this_pp = pp_entries[0]
    # The dependency parse is needed by both branches below.
    triples = list(next(nlp.dependency_parse(this_sent)).triples())

    if not use_direct:
        where_which_parseTraversal(this_sent, triples, nlp.ner(this_sent),
                                   questions)
        return questions

    # First word of the prepositional phrase.
    leading_word = this_pp.split(" ")[0]
    # NER tags are checked in this priority order; "place" is the fallback.
    tag_to_kind = (("COUNTRY", "country"), ("LOCATION", "location"),
                   ("CITY", "city"))
    place_kind = next(
        (kind for tag, kind in tag_to_kind if tag in ner_set), "place")
    return [construct_where_which(this_sent, triples, leading_word,
                                  place_kind)]
예제 #5
0
class QGPipeline:
    """Parse, split, simplify and generate questions from sentences.

    Thin orchestration layer over ``StanfordNLP`` (parsing),
    ``Simplification`` (tree rewriting) and ``QuestionGeneration``.
    """

    def __init__(self):
        self.sNLP = StanfordNLP()
        self.sent_simpl = Simplification()
        self.QG = QuestionGeneration()

    def getParseTree(self, text):
        """Return whatever ``self.sNLP.parse`` produces for ``text``."""
        return self.sNLP.parse(text)

    def splitConj(self, t):
        """Split parse tree ``t`` on conjunctions.

        Returns:
            The list filled by ``Simplification.splitConjunctions``; when
            that list is empty, a one-element list containing ``t`` itself.
        """
        t_list = self.sent_simpl.splitConjunctions(t, [], None)
        if not t_list:
            t_list.append(t)
        return t_list

    def simplify_sentence(self, t_list):
        """Simplify each tree in ``t_list`` and return the sentence strings.

        Commas are dropped, runs of spaces are collapsed, and every
        sentence is made to end with a period token (``" ."`` is appended
        when the text does not already end in ``"."``).  Trees that
        simplify to an empty string are skipped.
        """
        simplified_sentences = []
        for this in t_list:
            tokens = self.sent_simpl.traversalAndSimplification(this)
            processed_text = " ".join(tokens).replace(",", "")
            processed_text = re.sub(' +', ' ', processed_text).strip()
            if not processed_text:
                # Nothing survived simplification; indexing [-1] would
                # raise IndexError on the empty string.
                continue
            if not processed_text.endswith('.'):
                processed_text += ' .'
            simplified_sentences.append(processed_text)
        return simplified_sentences

    def QuesGen(self, simplified_sentences):
        """Return the concatenated ``self.QG.QG`` output for every sentence."""
        final_q_list = []
        for this in simplified_sentences:
            final_q_list.extend(self.QG.QG(this))
        return final_q_list
예제 #6
0
def categorizeQs(sents, sent_to_Q_dict):
    """Generate every applicable question type for one sentence.

    Each generated question is appended as a ``(sents, question)`` pair to
    the list stored in ``sent_to_Q_dict`` under ``"Binary"``, ``"Why"``,
    ``"What_Who"``, ``"Where_Which"`` or ``"When"``.  Those keys must
    already be present.

    Args:
        sents: Sentence text.
        sent_to_Q_dict: Mapping from question type to a list; mutated in
            place.

    Returns:
        None.
    """
    sent_features = {}
    sNLP = StanfordNLP()
    normal_ner_set = {t[1] for t in sNLP.ner(sents)}

    # any() also copes with an empty keyword list, where max([]) raises.
    if any(w in sents for w in aux_words):
        for p_b in Binary_QG.bin_question(sents):
            if p_b is not None:
                sent_to_Q_dict["Binary"].append((sents, p_b))

    if any(w in sents for w in why_keys):
        thisQues = Why_QG.why_q(sents)
        if thisQues is not None:
            sent_to_Q_dict["Why"].append((sents, thisQues))

    for p_ in What_Who_QG.What_Who_module(sents):
        if p_ is not None:
            sent_to_Q_dict["What_Who"].append((sents, p_))

    if normal_ner_set & {'LOCATION', 'COUNTRY', 'CITY'}:
        thisQ = Where_Which_QG.Where_Which_module(sents, sent_features)
        # Where_Which_module returns None when it finds nothing to ask
        # about; iterating None would raise TypeError.
        for p in thisQ or ():
            if p is not None:
                sent_to_Q_dict["Where_Which"].append((sents, p))

    if normal_ner_set & {'DATE', 'TIME'}:
        thisQ = When_QG.When_module(sents, sent_features)
        if thisQ is not None:
            sent_to_Q_dict["When"].append((sents, thisQ))
예제 #7
0
def why_q(sents):
    """Build a "Why ...?" question from a sentence.

    The sentence is parsed, reduced with ``What_Who_QG.remove_modifiers``,
    tokenised and POS-tagged.  The first token tagged ``VBD``, ``VBZ`` or
    ``VBP`` decides the question form:

    * ``VBD``: the verb is lemmatised and ``"Why did"`` is prepended.
    * ``VBZ``: a verb listed in ``aux_words`` is moved to the front after
      ``"Why"``; any other verb is lemmatised (a bare "do" is dropped) and
      ``"Why does"`` is prepended.
    * ``VBP``: the verb is moved to the front after ``"Why"``.

    If no such token exists the words are left in their original order.
    The text is then cut just before the first ``why_keys`` entry it
    contains and ``"?"`` is appended.

    Args:
        sents: Sentence text.

    Returns:
        The question string, or ``None`` when no tokens remain after
        modifier removal.
    """
    sNLP = StanfordNLP()
    parse = sNLP.parse(sents)
    sents = What_Who_QG.remove_modifiers(parse)

    tokens = word_tokenize(sents)
    if not tokens:
        # pos_tags[0] below would raise IndexError on an empty token list.
        return None

    pos_tags = nltk.pos_tag(tokens)
    # Lower-case the sentence-initial word unless it is a proper noun.
    if pos_tags[0][1] not in ('NNP', 'NNPS'):
        pos_tags[0] = (pos_tags[0][0].lower(), pos_tags[0][1])
    # Shallow copy is enough: the entries are immutable tuples.
    q_list = list(pos_tags)

    for i, (word, tag) in enumerate(pos_tags):
        if tag == 'VBD':
            q_list[i] = (wnl.lemmatize(word, pos='v'), 'VBD')
            q_list.insert(0, ('Why did', 0))
            break
        if tag == 'VBZ':
            if word in aux_words:
                q_list.insert(0, q_list.pop(i))
                q_list.insert(0, ("Why", 0))
            else:
                q_list[i] = (wnl.lemmatize(word, pos='v'), "VBZ")
                if q_list[i][0] == "do":
                    q_list.pop(i)
                q_list.insert(0, ("Why does", 0))
            break
        if tag == 'VBP':
            q_list.insert(0, q_list.pop(i))
            q_list.insert(0, ("Why", 0))
            break

    first_word = q_list[0][0]
    q_list[0] = (first_word[:1].upper() + first_word[1:], 0)

    question = ' '.join(item[0] for item in q_list)
    for k in why_keys:
        ind = question.find(k)
        if ind != -1:
            # Drop the key and the character before it.  Clamp at 0 so a
            # key at the very start does not turn the slice into [:-1].
            question = question[:max(ind - 1, 0)]
            break
    return question + "?"
예제 #8
0
파일: QA.py 프로젝트: jinpoon/11611_project
class QA():
    def __init__(self):
        """Create the NLP helpers, the lookup tables and the candidate buffers."""
        self.sNLP = StanfordNLP()
        # The three tables must exist before initQstType() fills them.
        self.dropType, self.typeNer, self.typePro = {}, {}, {}
        self.initQstType()
        self.candidateAnswer, self.candidateSentence = [], []
        self.qgPipeline = QGPipeline()
        # Similarity scores must exceed this value to count as a match.
        self.threshold = 90

    def initQstType(self):
        self.typeSet = ['WHADJP', 'WHADVP', 'WHPP', 'WHAVP', 'WHNP']
        self.dropType['WHADJP'] = ['NP', 'CD']
        self.dropType['WHAVP'] = ['PP', 'SBAR']
        self.dropType['WHADVP'] = ['PP', 'SBAR']
        self.dropType['WHPP'] = ['PP']
        self.dropType['WHNP'] = ['NP']
        self.dropType['UK'] = ['NP', 'NN']
        self.auxWord = ['did', 'do', 'does', 'is', 'are', 'were', 'was']
        self.typePro['where'] = ['in', 'at', 'on', 'behind', 'next']
        self.typeNer['when'] = ['DATE']
        self.typeNer['where'] = [
            'CITY', 'STATE_OR_PROVINCE', 'ORGANIZATION', 'LOCATION', 'COUNTRY'
        ]

    def decideType(self, myParent):
        if self.qstFlag:
            return
        for node in myParent:
            #node.pretty_print()
            if self.qstFlag:
                return

            if isinstance(node, str): continue

            if node.label() in self.typeSet:
                self.thisType = node.label()
                myParent.remove(node)
                self.qstFlag = True
            self.decideType(node)
            if node.label() == 'ROOT':
                self.qstSim = node.leaves()
                self.qstSim = ' '.join(self.qstSim[:-1])

    def parseDep(self, x):
        """Normalise the two words of a dependency triple.

        Args:
            x: A triple whose first and third items are sequences with the
                word at index 0.

        Returns:
            A 2-tuple with both words lower-cased and lemmatised by
            ``WordNetLemmatizer``.
        """
        lemmatizer = WordNetLemmatizer()
        first_word = lemmatizer.lemmatize(x[0][0].lower())
        third_word = lemmatizer.lemmatize(x[2][0].lower())
        return (first_word, third_word)

    def bin_answer(self, question, sent):
        """Answer a yes/no ``question`` against a single sentence.

        Two strategies are tried in order:

        1. Dependency match: if a subject relation (``nsubj``,
           ``nsubjpass``, ``csubj``, ``csubjpass``) of the sentence equals
           one of the question's after ``parseDep`` normalisation, the
           answer is ``'No'`` when the question has a ``neg`` relation and
           ``'Yes'`` otherwise, both with score 100.
        2. Token fallback: both texts are lower-cased and tokenised.  Each
           negation word found in the question toggles the verdict, in a
           fixed order.  Only when the question contains no negation word
           does fuzzy similarity decide (``'Yes'`` above 90, else ``'No'``).

        The similarity verdict used to overwrite the negation verdict
        unconditionally, so the negation check had no effect.  The negation
        words were also iterated as a set, whose order is not fixed, even
        though the toggles are order-dependent.

        Args:
            question: Question text.
            sent: Candidate sentence text.

        Returns:
            ``(answer, score)``.  ``answer`` is ``'Yes'`` or ``'No'``;
            ``score`` is 100 for a dependency match, otherwise the
            ``fuzz.partial_ratio`` of the lower-cased texts.
        """
        subject_rels = ('nsubj', 'nsubjpass', 'csubj', 'csubjpass')

        # Only the first dependency graph returned for each text is used.
        qstTree = list(next(self.sNLP.dependency_parse(question)).triples())
        sentTree = list(next(self.sNLP.dependency_parse(sent)).triples())

        qstSub = [self.parseDep(x) for x in qstTree if x[1] in subject_rels]
        question_negated = any(x[1] == 'neg' for x in qstTree)
        subject_match = any(
            x[1] in subject_rels and self.parseDep(x) in qstSub
            for x in sentTree)
        if subject_match:
            return ('No', 100) if question_negated else ('Yes', 100)

        question = question.lower()
        sent = sent.lower()
        q_tokens = word_tokenize(question)
        s_tokens = word_tokenize(sent)
        sim = fuzz.partial_ratio(question, sent)

        # case 1: negations
        ans = ''
        for word in ('not', 'never', "aren't"):
            if word not in q_tokens:
                continue
            if word in s_tokens:
                # Negated in both texts: they agree.
                ans = "No" if ans == "Yes" else "Yes"
            else:
                # Negated only in the question: they disagree.
                ans = "Yes" if ans == "No" else "No"

        # case 2: similarity, only when no negation word decided above
        if ans == '':
            ans = "Yes" if sim > 90 else "No"
        return (ans, sim)

    def qstType(self, qst):
        self.thisType = 'UK'
        self.qstFlag = False
        self.qstSim = None

        tree = self.sNLP.parser_sents([
            qst,
        ])
        for i in tree:
            self.decideType(i)

    def fitness(self, txt, qst):
        """Tell whether sentence ``txt`` is a good match for question ``qst``.

        Questions without a recognised wh-phrase (type ``'UK'``) are scored
        with ``bin_answer``.  For all others, ``txt`` and its simplified
        variants are parsed, and every constituent that the question type
        allows to be dropped is removed in turn.  Each remainder is compared
        with the question text (``self.qstSim``) using ``fuzz.partial_ratio``.

        Args:
            txt: Candidate sentence text.
            qst: Question text.

        Returns:
            ``True`` when the best score exceeds ``self.threshold``.
        """
        self.qstType(qst)
        if self.thisType == 'UK':
            _, sim = self.bin_answer(qst, txt)
            return sim > self.threshold

        qstType = self.thisType
        self.candidateAnswer = []
        self.candidateSentence = []

        # The sentence itself plus its conjunction-split, simplified forms.
        thisParseTree = self.qgPipeline.getParseTree(txt)
        no_conj_list = self.qgPipeline.splitConj(thisParseTree)
        extendList = [txt]
        extendList.extend(self.qgPipeline.simplify_sentence(no_conj_list))

        for sentence in extendList:
            for tree in self.sNLP.parser_sents([sentence]):
                # Pass k drops the (k+1)-th droppable constituent from a
                # fresh copy; stop once a pass finds no further one.
                self.dropTotal = 0
                self.dropFlag = 1
                while self.dropFlag:
                    self.findFlag = 0
                    self.dropTime = 0
                    self.dropFragment(copy.deepcopy(tree), qstType)
                    if self.dropTime <= self.dropTotal:
                        self.dropFlag = 0
                    self.dropTotal += 1

        best_dis = 0
        best_ans = '_'

        # Without a question text there is nothing to compare against.
        if self.qstSim is not None:
            for i, words in enumerate(self.candidateSentence):
                score = fuzz.partial_ratio(self.qstSim, ' '.join(words))
                this_ans = ' '.join(self.candidateAnswer[i])
                if score < best_dis:
                    continue
                if score == best_dis:
                    # Ties: adverbial/prepositional questions keep the
                    # shorter answer, noun-phrase questions the longer one.
                    if (self.thisType in ('WHADVP', 'WHPP')
                            and len(this_ans) >= len(best_ans)):
                        continue
                    if (self.thisType == 'WHNP'
                            and len(this_ans) <= len(best_ans)):
                        continue
                best_dis = score
                best_ans = this_ans

        return self.threshold < best_dis

    def dropFragment(self, myParent, qstType):
        flag = 0
        for node in myParent:
            if isinstance(node, str): continue
            if self.dropTime > self.dropTotal:
                return
            if node.label() in self.dropType[qstType]:
                self.dropTime += 1
                if self.dropTime > self.dropTotal:
                    myParent.remove(node)
                    self.candidateAnswer.append(node.leaves())
                    self.findFlag = 1
                    return
            self.dropFragment(node, qstType)
            if node.label() == 'ROOT' and self.findFlag:
                # print(node.leaves())
                self.candidateSentence.append(node.leaves())

    def findFragment(self, myParent, qstType):
        for node in myParent:
            if isinstance(node, str): continue
            # node.pretty_print()
            if node.label() in self.dropType[qstType]:
                self.candidateAnswer.append((node.leaves(), node.label()))

            self.findFragment(node, qstType)

    def answerSpecial(self, txtList, tokens, qstType):
        """Print the first candidate fragment whose NER tag fits ``qstType``.

        Every text in ``txtList`` is parsed and ``findFragment`` collects
        the candidate fragments.  A fragment is kept when the NER tag of its
        second token is listed in ``self.typeNer[qstType]``.  The first kept
        fragment is printed; nothing is printed when none qualifies.

        NOTE(review): ``qstType`` is used as a key into both
        ``self.dropType`` (via ``findFragment``) and ``self.typeNer``, but
        ``initQstType`` gives those tables different keys - confirm which
        values callers pass.

        Args:
            txtList: Iterable of sentence texts.
            tokens: Not read by this method.
            qstType: Key into ``self.dropType`` and ``self.typeNer``.

        Returns:
            None.  Results are stored in ``self.finalAnswer`` and printed.
        """
        self.candidateAnswer = []
        self.finalAnswer = []
        self.candidateSentence = []
        for txt in txtList:
            for tree in self.sNLP.parser_sents([txt]):
                self.findFragment(tree, qstType)

        for fragment in self.candidateAnswer:
            sentence = ' '.join(fragment[0])
            pos_tag = self.sNLP.ner(sentence)
            # NOTE(review): looks like leftover debug output - confirm
            # before removing, since callers may capture stdout.
            print(pos_tag)
            # The check reads the second token's tag, so a fragment with
            # fewer than two tagged tokens cannot match.  Skip it rather
            # than raise IndexError.
            if len(pos_tag) > 1 and pos_tag[1][1] in self.typeNer[qstType]:
                self.finalAnswer.append(sentence)

        # Indexing [0] on an empty result would raise IndexError.
        if self.finalAnswer:
            print(self.finalAnswer[0])

    def preProcessText(self, text):
        data = re.sub("\(.*\)", "", text)
        data = re.sub(' +', ' ', data).strip()
        return data

    def answer(self, txtList, qst):
        """Print the best answer to ``qst`` found in ``txtList``.

        Questions without a recognised wh-phrase (type ``'UK'``) are treated
        as yes/no questions: the ``bin_answer`` verdict of the
        highest-scoring sentence is printed.

        For all others, each sentence of 4 to 25 tokens and its simplified
        variants are parsed.  Every droppable constituent is removed in turn,
        and the remainder is compared with the question text using
        ``fuzz.partial_ratio``.  The removed constituent of the best
        remainder is the answer.  For "who", "when" and "where" questions, a
        candidate without a fitting NER tag must beat the current best by at
        least 10 points and is then penalised by 10.

        Args:
            txtList: Iterable of candidate sentence texts.
            qst: Question text.

        Returns:
            None.  The answer, or an "I cannot answer" message, is printed.
        """
        self.head = word_tokenize(qst)[0].lower()

        self.qstType(qst)
        if self.thisType == 'UK':
            best_score = 0
            best_ans = 'Yes'
            for txt in txtList:
                ans, sim = self.bin_answer(qst, txt)
                if sim > best_score:
                    best_ans = ans
                    best_score = sim
            print(best_ans + '.')
            return

        qstType = self.thisType
        self.candidateAnswer = []
        self.candidateSentence = []

        extendList = []
        for thisSent in txtList:
            thisSent = self.preProcessText(thisSent)
            token_count = len(word_tokenize(thisSent))
            if token_count < 4 or token_count > 25:
                continue

            extendList.append(thisSent)
            thisParseTree = self.qgPipeline.getParseTree(thisSent)
            no_conj_list = self.qgPipeline.splitConj(thisParseTree)
            extendList.extend(self.qgPipeline.simplify_sentence(no_conj_list))

        for sentence in extendList:
            for tree in self.sNLP.parser_sents([sentence]):
                # Pass k drops the (k+1)-th droppable constituent from a
                # fresh copy; stop once a pass finds no further one.
                self.dropTotal = 0
                self.dropFlag = 1
                while self.dropFlag:
                    self.findFlag = 0
                    self.dropTime = 0
                    self.dropFragment(copy.deepcopy(tree), qstType)
                    if self.dropTime <= self.dropTotal:
                        self.dropFlag = 0
                    self.dropTotal += 1

        # NER tags an answer is expected to carry, by question word.
        expected_ner = {
            'who': ('PERSON', 'ORGANIZATION'),
            'when': ('DATE', ),
            'where': ('LOCATION', 'CITY', 'ORGANIZATION',
                      'STATE_OR_PROVINCE', 'COUNTRY'),
        }.get(self.head)

        best_dis = 0
        best_ans = '_'

        # Without a question text there is nothing to compare against.
        if self.qstSim is not None:
            for i, words in enumerate(self.candidateSentence):
                score = fuzz.partial_ratio(self.qstSim, ' '.join(words))
                this_ans = ' '.join(self.candidateAnswer[i])
                if score < best_dis:
                    continue
                if score == best_dis:
                    # Ties: adverbial/prepositional questions keep the
                    # shorter answer, noun-phrase questions the longer one.
                    if (self.thisType in ('WHADVP', 'WHPP')
                            and len(this_ans) >= len(best_ans)):
                        continue
                    if (self.thisType == 'WHNP'
                            and len(this_ans) <= len(best_ans)):
                        continue
                if expected_ner is not None:
                    ners = getExhaustiveNERs(this_ans)
                    if not any(tag in ners[0] for tag in expected_ner):
                        if score - best_dis < 10:
                            continue
                        score = score - 10
                best_dis = score
                best_ans = this_ans

        if best_ans == '_':
            print('I cannot answer that question: ' + qst)
        else:
            # Upper-case only the first character.  str.capitalize() would
            # lower-case the rest and mangle proper nouns.
            print(best_ans[:1].upper() + best_ans[1:] + '.')

    def edit_distance(self, s1, s2):
        if len(s1) < len(s2):
            return self.edit_distance(s2, s1)
        # len(s1) >= len(s2)
        if len(s2) == 0:
            return len(s1)
        previous_row = range(len(s2) + 1)
        for i, c1 in enumerate(s1):
            c1 = c1.lower()
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                c2 = c2.lower()
                insertions = previous_row[
                    j +
                    1] + 1  # j+1 instead of j since previous_row and current_row are one character longer
                deletions = current_row[j] + 1  # than s2
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

        for i in range(len(self.candidateSentence)):
            nowSentence = self.candidateSentence[i]

            score = self.edit_distance(nowSentence, tokens)
            best_candi = ' '.join(nowSentence)
            this_ans = ' '.join(self.candidateAnswer[i])
            if (score < best_dis
                    or (score == best_dis and len(this_ans) < len(best_ans))):
                best_dis = score
                best_ans = this_ans
        return best_dis
예제 #9
0
 def __init__(self):
     """Create the parser, simplifier and question generator kept on self."""
     self.sNLP = StanfordNLP()
     self.sent_simpl = Simplification()
     self.QG = QuestionGeneration()
예제 #10
0
 def __init__(self):
     """Store a ``StanfordNLP`` instance on ``self.sNLP``."""
     self.sNLP = StanfordNLP()
예제 #11
0
class Simplification:
    """Parse-tree rewrites that split and simplify complex sentences."""

    def __init__(self):
        self.sNLP = StanfordNLP()

    def coref_resolve(self, passage):
        """Run coreference resolution on a fixed sample text.

        NOTE(review): ``passage`` is ignored, the text is hard-coded and
        the parsed result is discarded, so this looks like unfinished
        experimental code - confirm whether it should use ``passage`` and
        return something.
        """
        text = "John is a tennis player. He has an awesome physique."
        result = json.loads(self.sNLP.coref(text))
        num, mentions = list(result['corefs'].items())[0]

    def splitConjunctions(self, parent, list_sents, firstPP):
        """Collect the clauses joined by coordinating conjunctions.

        Walks the tree depth-first and modifies it in place:

        * A ``PP`` that is the first child of an ``S`` is detached.  A copy
          of it is appended to every later ``VP`` child of that ``S``, and
          a copy is also passed one level down into each later sibling.
        * For every ``CC`` other than "or"/"nor" that has a sibling on both
          sides, each adjacent sibling labelled ``S`` is appended to
          ``list_sents``.

        Args:
            parent: Tree (or iterable of trees) to walk.
            list_sents: List that receives the clause subtrees.
            firstPP: Leading ``PP`` inherited from the caller, or ``None``.

        Returns:
            ``list_sents``.
        """
        firstFlag = 0
        # Iterate over a snapshot: children are removed from ``parent``
        # inside the loop, which would make a live iterator skip the
        # sibling that follows a removed node.
        for node in list(parent):
            if not isinstance(node, ParentedTree):
                continue

            if node.label() != 'ROOT':
                if (node.label() == "PP" and node.parent().label() == "S"
                        and node.left_sibling() is None):
                    parent.remove(node)
                    firstPP = node
                    firstFlag = 1
                elif (node.label() == "VP" and firstPP is not None
                      and node.parent().label() == 'S'):
                    # Insert a copy: a subtree can only have one parent,
                    # and the PP may be needed by several VPs.
                    node.insert(len(node), firstPP.copy(deep=True))

                # leaves() is a list of words, so join it before comparing
                # it with the excluded conjunctions.
                conjunction = " ".join(node.leaves()).lower()
                if (node.label() == "CC"
                        and conjunction not in ('or', 'nor')
                        and node.left_sibling() is not None
                        and node.right_sibling() is not None):
                    if node.left_sibling().label() == "S":
                        list_sents.append(node.left_sibling())
                    if node.right_sibling().label() == "S":
                        list_sents.append(node.right_sibling())

            # Detached nodes have no parent and are not descended into.
            if node.parent() is not None:
                # Hand down only a PP found at this level, as a copy.
                firstPP_temp = firstPP.copy(deep=True) if firstFlag else None
                self.splitConjunctions(node, list_sents, firstPP_temp)
        return list_sents

    def traversalAndSimplification(self, parent):
        """Prune optional constituents and return the remaining words.

        Removed in place from the tree under ``parent``:

        * every ``ADVP``, ``SBAR`` and ``SBARQ``;
        * a ``PP`` inside a ``VP`` that directly follows a comma;
        * a child of an ``NP`` enclosed by commas on both sides.

        Args:
            parent: Tree to simplify.

        Returns:
            ``parent.leaves()`` after pruning.
        """
        # Snapshot for the same reason as in splitConjunctions: removing a
        # child must not make the loop skip the following sibling.
        for node in list(parent):
            if not isinstance(node, ParentedTree):
                continue

            if node.label() != 'ROOT':
                left = node.left_sibling()
                right = node.right_sibling()
                if node.label() in ("ADVP", "SBAR", "SBARQ"):
                    parent.remove(node)
                elif (node.label() == 'PP' and node.parent().label() == "VP"
                      and left is not None and left.label() == ','):
                    # ``left`` is None for a first child, so it is checked
                    # before its label is read.
                    parent.remove(node)
                elif (node.parent().label() == 'NP' and left is not None
                      and right is not None):
                    if left.label() == ',' and right.label() == ',':
                        parent.remove(node)

            # Removed nodes have no parent and are not descended into.
            if node.parent() is not None:
                self.traversalAndSimplification(node)
        return parent.leaves()
예제 #12
0
 def __init__(self):
     """Store a ``StanfordNLP`` instance and the set of "be" verb forms."""
     self.sNLP = StanfordNLP()
     # Lower-case forms of "to be".
     self.beVerbs = {"am", "is", "are", "was", "were"}
예제 #13
0
class QuestionGeneration:
    def __init__(self):
        """Store a ``StanfordNLP`` instance and the set of "be" verb forms."""
        self.sNLP = StanfordNLP()
        # Lower-case forms of "to be"; beWork() looks tokens up in this set.
        self.beVerbs = {"am", "is", "are", "was", "were"}
        # self.aux_verbs = {'is', 'were', 'can', 'could', }

    def auxilaryWord(self, sub, POS_tag):
        # TODO lowercase
        # TODO will may...
        # TODO plural...
        # Jerry and I
        if sub.lower() in ('i', 'they', 'you'):
            return 'do'
        if sub.lower() in ('he', 'she'):
            return 'does'

    def beWork(self, sentence):
        # pos = nltk.pos_tag(sentence)
        j = None
        for i in range(len(sentence) - 1):
            if sentence[i] in self.beVerbs:
                j = i
                break
        if j is not None:
            temp = sentence[j]
            sentence.pop(j)
            sentence.insert(0, temp)
            #print(sentence)
            return sentence

        return

    # def getNounandVerbOfSentence(self, sentence):

    def QG(self, text):

        dep_parse_Tree = self.sNLP.dependency_parse(text)
        dep_parse_Tree = dep_parse_Tree.__next__()
        Ques_list = []

        # Yes or No question

        be_question = self.beWork(text)
        if be_question is not None:
            be_question += '?'
            Ques_list.append(be_question)

        # WHO question for Subject

        # create NER tags
        ner_tags = dict(self.sNLP.ner(text))
        pos_tag = self.sNLP.pos(text)
        #print(ner_tags)

        # get triples list of the dependency tree
        triples_list = list(dep_parse_Tree.triples())
        #print(triples_list)
        ##### LOOP THRU DEPENDENCY TREE AND CREATE QUESTIONS
        auxWord = 'xxx'
        for this in triples_list:
            # print(this)
            temp_text = '?'

            # for the subject question
            if this[1] in ['nsubj', 'csubj', 'nsubjpass']:
                subject = None
                sub_pos = None
                # in order of preference
                if this[2][1] in ['NNP', 'NNPS', 'PRP']:
                    subject = this[2][0]
                    sub_pos = this[2][1]
                elif this[0][1] in ['NNP', 'NNPS']:
                    subject = this[0][0]
                    sub_pos = this[0][1]
                elif this[2][1] in ['NN', 'NNS']:
                    subject = this[2][0]
                    sub_pos = this[2][1]

                #print("sub", subject)
                if subject is not None:  # need to add sub_pos
                    auxWord = self.auxilaryWord(subject, sub_pos)

                    if ner_tags[subject] in ['PERSON', 'TITLE', 'MISC'
                                             ]:  # check if its a PERSON NER
                        temp_text = self.contructQ(triples_list, subject, text,
                                                   None)
                        temp_text = temp_text.replace(subject, "Who").replace(
                            " .", "?")  # create question

                        # some string manipulation to get the ?
                        if "?" not in temp_text:
                            temp_text = temp_text + "?"
                            # print(text.replace(subject, "Who").replace(" .", "?"))

                    if ner_tags[
                            subject] == 'ORGANIZATION':  # if the subject is ORG
                        temp_text = text.replace(subject,
                                                 "Which organization").replace(
                                                     " .", "?")

                    if ner_tags[subject] == 'CITY':  # if the subject is CITY
                        temp_text = text.replace(subject,
                                                 "Which city").replace(
                                                     " .", "?")

                    if ner_tags[
                            subject] == 'COUNTRY':  # if the subject is CITY
                        temp_text = text.replace(subject,
                                                 "Which country").replace(
                                                     " .", "?")

                    if this[2][1] in ['PRP']:  # if the subject is preposition
                        temp_text = text.replace(subject,
                                                 "Who").replace(" .", "?")

                    if ner_tags[subject] in [
                            'O', 'LOCATION'
                    ] and temp_text == '?':  # if the subject is Other
                        temp_text = self.contructQ(triples_list, subject, text,
                                                   None)
                        if sub_pos == 'PRP' and subject.lower() in [
                                'they', 'he', 'she'
                        ]:
                            temp_text = temp_text.replace(subject,
                                                          "Who").replace(
                                                              " .", "?")
                        else:
                            temp_text = temp_text.replace(subject,
                                                          "What").replace(
                                                              " .", "?")

            # for number, How many questions
            elif this[1] in ['nummod']:
                numPhrase = this[2][0] + ' ' + this[0][0]
                targetWord = this[2][0]
                if ner_tags[targetWord] in ('NUMBERS'):
                    temp_text = text.replace(numPhrase, "").replace(" .", "?")
                    temp_text = "How many " + this[0][0] + " " + (
                        auxWord
                        if auxWord is not None else "") + " " + temp_text

            # for possessive questions
            elif this[1] in ['nmod:poss']:
                if this[2][1] in ['NNP']:
                    # if this[2][0][-1] == 's':
                    #     poss_word = this[2][0]
                    # else:
                    poss_word = this[2][0]  #+ " 's"
                    temp_text = self.contructQ(triples_list, this[2][0], text,
                                               None)
                    temp_text = temp_text.replace(poss_word, "Whose").replace(
                        " .", "?").replace("'s", "").replace(" '", "")
                    if not temp_text.startswith("Whose"):
                        temp_text = temp_text.replace("Whose",
                                                      "whose").replace(
                                                          " '", "")

            # for prop questions
            elif this[1] in ('case'):
                subject = this[0][0]
                propPhrase = this[2][0] + ' ' + this[0][0]
                # print(propPhrase)
                if ner_tags[subject] in ['CITY']:  # where
                    temp_text = text.replace(propPhrase, "").replace(
                        " .", "?")  # create question
                    temp_text = "Where " + (auxWord if auxWord is not None else
                                            "") + " " + temp_text
                    # some string manipulation to get the ?
                if ner_tags[subject] in ['DATE']:  # when
                    temp_text = text.replace(propPhrase, "").replace(" .", "?")
                    # print(auxWord, temp_text)
                    temp_text = "When " + (auxWord if auxWord is not None else
                                           "") + " " + temp_text

            elif this[1] in ('iobj', 'dobj'):
                # code to be written for questions on direct and indirect Objects
                pass
            #### endif

            if "?" not in temp_text:
                temp_text = temp_text + "?"
            if temp_text != '?':
                # print(temp_text)
                Ques_list.append(temp_text)

        return (Ques_list)

    #### If the subject has modifiers, or is part of a long NP, remove all of the subject's related modifiers with the help of the dependency tree.
    #### The same is to be replicated for the object as well.
    def contructQ(self, list_triples, subject, text, object):
        """Strip the subject's modifiers from ``text`` to prepare a question stem.

        When ``subject`` is given, everything before its first occurrence in
        ``text`` is dropped. The dependency triples are then walked in order:
        every dependent of the subject (or of a word that was already
        removed) is deleted from the text, except determiners and dependents
        attached through an ``nsubj``/``csubj`` relation.

        Args:
            list_triples: dependency triples, each indexed as
                ``((head_word, ...), relation, (dependent_word, ...))``.
            subject: the subject word, or ``None`` to return ``text`` as is.
            text: the sentence to trim.
            object: not used by this method; kept so callers stay unchanged.

        Returns:
            The trimmed text. After each removal, runs of spaces are
            collapsed and the ends are stripped.
        """
        if subject is None:
            return text

        # Slice off leading words (e.g. determiners) before the subject.
        # str.find() returns -1 on a miss, and text[-1:] would keep only the
        # last character, so only slice when the subject really occurs.
        start = text.find(subject)
        if start > 0:
            text = text[start:]

        # Words already deleted; anything depending on them is deleted too.
        removed_words = set()
        for triple in list_triples:
            head_word = triple[0][0]
            if head_word != subject and head_word not in removed_words:
                continue
            if triple[1] in ('nsubj', 'csubj'):
                continue
            dependent = triple[2][0]
            # Determiners are skipped because they can also appear in other
            # places of the sentence.
            if dependent.lower() in ('the', 'a', 'an'):
                continue
            text = re.sub(' +', ' ', text.replace(dependent, '')).strip()
            removed_words.add(dependent)

        return text
예제 #14
0
def getNerSet(phrase):
    """Return the set of distinct NER labels found in ``phrase``.

    A fresh ``StanfordNLP`` wrapper is created and the element at index 1 of
    every item it returns from ``ner(phrase)`` is collected (presumably the
    tag of a ``(token, tag)`` pair -- confirm against the wrapper).
    """
    tagger = StanfordNLP()
    labels = set()
    for item in tagger.ner(phrase):
        labels.add(item[1])
    return labels
예제 #15
0
def bin_question(sents):
    """Generate a yes/no (binary) question from a declarative sentence.

    The sentence is parsed, passed through ``What_Who_QG.remove_modifiers``,
    tokenized and POS-tagged. The first auxiliary found (``are``, ``was``,
    ``were``, ``is``, ``have``, ``has``) is moved to the front; otherwise the
    first VBD/VBZ/VBP verb is lemmatized and ``Did``/``Does``/``Do`` is
    prepended.

    Args:
        sents: the input sentence text.

    Returns:
        A list holding the generated question, or an empty list when the
        sentence yields no tokens or no auxiliary / finite verb to front.
    """
    sNLP = StanfordNLP()
    parse = sNLP.parse(sents)
    sents = What_Who_QG.remove_modifiers(parse)

    aux_words = {'are', 'was', 'were', 'is', 'have', 'has'}
    # Support verb to prepend for each finite-verb POS tag.
    do_support = {'VBD': 'Did', 'VBZ': 'Does', 'VBP': 'Do'}
    question_starters = aux_words | {'did', 'do', 'does'}
    terminal_punct = {'.', '!', '?'}

    pos_tags = nltk.pos_tag(word_tokenize(sents))
    if not pos_tags:
        # Nothing to work with; avoids an IndexError on pos_tags[0].
        return []

    # Lower-case the sentence-initial word unless it is a proper noun, since
    # it will no longer be the first word of the question.
    first_word, first_tag = pos_tags[0]
    if first_tag not in ('NNP', 'NNPS'):
        pos_tags[0] = (first_word.lower(), first_tag)

    q_list = list(pos_tags)
    for i, (word, tag) in enumerate(pos_tags):
        if word in aux_words:
            # Subject-auxiliary inversion: move the auxiliary to the front.
            q_list.insert(0, q_list.pop(i))
            break
        if tag in do_support:
            # Do-support: use the base form of the verb and prepend Did/Does/Do.
            q_list[i] = (wnl.lemmatize(word, pos='v'), tag)
            q_list.insert(0, (do_support[tag], 0))
            break

    starter = q_list[0][0]
    if starter.lower() not in question_starters:
        return []

    words = [starter[:1].upper() + starter[1:]]
    words.extend(token for token, _ in q_list[1:])
    # Drop the sentence-final punctuation token only when it is really there;
    # blindly cutting two characters would mangle the last word otherwise.
    if len(words) > 1 and words[-1] in terminal_punct:
        words.pop()

    return [' '.join(words) + "?"]
예제 #16
0
#     for i in dep_tree:
#         if i[1] in ['nsubj', 'csubj', 'nsubjpass']:
#             return(i[0][0], i[0][1])
#     return (None,None)

# def findAuxVerb(dep_tree, verb):
#     aux = ""
#     mod = ""
#     for i in dep_tree:
#         if i[0][0] == verb and i[1] in ["auxpass", "aux"]:
#             aux += i[2][0]+" "
#         if i[0][0] == verb and i[1] in ["adv", "advmod"]:
#             mod += i[2][0] + " "
#     return (aux, mod, verb)

# Module-level StanfordNLP instance; getDecapitalized below reads it as a global.
sNLP = StanfordNLP()


def getDecapitalized(sentence):
    """Return ``sentence`` re-joined from tokens with its first token lower-cased.

    The first token keeps its case when the tag of the first item returned by
    ``sNLP.ner(sentence)`` is one of the listed entity types. Tokens are
    joined with single spaces; a one-token sentence therefore comes back with
    a trailing space.
    """
    keep_case_tags = (
        'PERSON', 'LOCATION', 'ORGANIZATION', 'CITY', 'NATIONALITY',
        'COUNTRY', 'TIME',
    )

    words = sNLP.word_tokenize(sentence)
    head = words[0]
    ner_items = sNLP.ner(sentence)

    # Only decapitalize when the sentence does not open with a named entity.
    first_tag = ner_items[0][1]
    if first_tag not in keep_case_tags:
        head = head.lower()

    remainder = " ".join(words[1:])
    return head + " " + remainder