def __init__(self):
    """Set up NLP tooling, question-type lookup tables, and matching state."""
    # Wrapped Stanford CoreNLP client used for parsing/NER throughout.
    self.sNLP = StanfordNLP()
    # Question-type tables, populated by initQstType() below.
    self.dropType = {}
    self.typeNer = {}
    self.typePro = {}
    self.initQstType()
    # Working buffers for candidate answers/sentences, refilled per question.
    self.candidateAnswer = []
    self.candidateSentence = []
    # Sentence-simplification / question-generation pipeline.
    self.qgPipeline = QGPipeline()
    # Minimum fuzzy-match score for an answer to be accepted.
    self.threshold = 90
def When_module(sent, sent_features):
    """Generate a "When ...?" question from *sent*.

    Prefers the when-phrase recorded with the smallest score by
    when_parseTraversal; otherwise falls back to the first collected
    question candidate. Returns None when nothing matches.
    """
    questions = []
    structures = []
    nlp = StanfordNLP()
    tree = nlp.parse(sent)
    when_parseTraversal(sent, tree, questions, structures)

    def _dep_triples(text):
        # First dependency parse of *text*, flattened to a triple list.
        dep = nlp.dependency_parse(text).__next__()
        return list(dep.triples())

    if structures:
        # Smallest score wins; ties keep the earliest entry, matching the
        # original strict "<" scan.
        best_phrase, _ = min(structures, key=lambda pair: pair[1])
        stripped = sent.replace(best_phrase, "")
        return construct_when(stripped, _dep_triples(stripped))

    for candidate in questions:
        # Original code returned on the first iteration as well.
        return construct_when(candidate, _dep_triples(candidate))
    return None
def __init__(self, dataFile):
    """Prepare the given text for sentence extraction.

    NOTE(review): despite the name, *dataFile* is treated as raw text
    (not a path) — the file-reading line was already disabled upstream.
    """
    self.sNLP = StanfordNLP()
    # Sentence-terminating characters used when splitting the text.
    self.punc = {'.', '?', '!', '\n'}
    self.textData = self.preProcessText(dataFile)
    self.sentence_list = []
    self.tokenizePara(self.textData)
def Where_Which_module(sent, sent_features):
    """Generate "Where/Which ...?" questions for *sent*.

    Returns a single-element list when the sentence opens with a
    locational prepositional phrase, otherwise whatever questions the
    parse-traversal fallback collects (possibly an empty list — never
    None, since callers iterate the result).
    """
    question = []
    simple_ques = []
    sNLP = StanfordNLP()
    parse = sNLP.parse(sent)
    where_which_inFirstPP(sent, parse, simple_ques)
    if len(simple_ques) > 0:
        # Fixed: the original unpacked into locals named `bool` and `type`,
        # shadowing the builtins; renamed without changing behavior.
        for has_first_pp, thisSent, nerSet, thisPP in simple_ques:
            dep_tree = sNLP.dependency_parse(thisSent)
            dep_tree = dep_tree.__next__()
            dep_tree_list = list(dep_tree.triples())
            if has_first_pp:
                # The preposition of the leading PP decides the phrasing.
                case = thisPP.split(" ")[0]
                # Pick the most specific place word supported by the NER tags.
                if "COUNTRY" in nerSet:
                    place_kind = "country"
                elif "LOCATION" in nerSet:
                    place_kind = "location"
                elif "CITY" in nerSet:
                    place_kind = "city"
                else:
                    place_kind = "place"
                return [construct_where_which(thisSent, dep_tree_list, case, place_kind)]
            else:
                where_which_parseTraversal(thisSent, dep_tree_list,
                                           sNLP.ner(thisSent), question)
    return question
class QGPipeline: # SENTENCE SIMPLIFICATION ### removing parenthetical phrases # print(text) def __init__(self): self.sNLP = StanfordNLP() self.sent_simpl = Simplification() self.QG = QuestionGeneration() def getParseTree(self, text): # text = re.sub("\(.*\)", "", text) # text = re.sub("\\n", "", text) t = self.sNLP.parse(text) # print("Parse:", t) return (t) def splitConj(self, t): # STEP 1: split on conjunctions t_list = [] t_list = self.sent_simpl.splitConjunctions(t, t_list, None) if len(t_list) == 0: t_list.append(t) return (t_list) # Simplify split parent sentences def simplify_sentence(self, t_list): simplified_sentences = [] for this in t_list: processed_text = " ".join(self.sent_simpl.traversalAndSimplification(this)) processed_text = processed_text.replace(",", "") processed_text = re.sub(' +', ' ', processed_text).strip() if processed_text[-1] != '.': processed_text += ' .' simplified_sentences.append(processed_text) return (simplified_sentences) # print("Simplified Sentences...") # print(simplified_sentences) #### Question generation def QuesGen(self, simplified_sentences): final_q_list = [] for this in simplified_sentences: final_q_list.extend(self.QG.QG(this)) # print("Questions...") return (final_q_list)
def categorizeQs(sents, sent_to_Q_dict):
    """Route sentence *sents* to every applicable question generator.

    Generated (sentence, question) pairs are appended to the matching
    bucket of *sent_to_Q_dict*: "Binary", "Why", "What_Who",
    "Where_Which", or "When".
    """
    sent_features = {}
    sNLP = StanfordNLP()
    ner_labels = {tag for _, tag in sNLP.ner(sents)}

    # Binary (yes/no) questions whenever an auxiliary verb appears.
    if any(w in sents for w in aux_words):
        for q in Binary_QG.bin_question(sents):
            if q is not None:
                sent_to_Q_dict["Binary"].append((sents, q))

    # "Why" questions when a causal keyword appears.
    if any(w in sents for w in why_keys):
        why_question = Why_QG.why_q(sents)
        if why_question is not None:
            sent_to_Q_dict["Why"].append((sents, why_question))

    # "What/Who" questions are attempted unconditionally.
    for q in What_Who_QG.What_Who_module(sents):
        if q is not None:
            sent_to_Q_dict["What_Who"].append((sents, q))

    # Location-style questions require a location-like NER tag.
    if ner_labels & {'LOCATION', 'COUNTRY', 'CITY'}:
        for q in Where_Which_QG.Where_Which_module(sents, sent_features):
            if q is not None:
                sent_to_Q_dict["Where_Which"].append((sents, q))

    # "When" questions require a temporal NER tag.
    if ner_labels & {'DATE', 'TIME'}:
        when_question = When_QG.When_module(sents, sent_features)
        if when_question is not None:
            sent_to_Q_dict["When"].append((sents, when_question))
def why_q(sents):
    """Build a "Why ...?" question from *sents*, or return None.

    The sentence is simplified (modifiers removed), POS-tagged, and the
    first matching verb form (VBD/VBZ/VBP) triggers insertion of the
    appropriate "Why did/does/..." prefix. Text from the first why-keyword
    onward is then cut off before appending '?'.
    """
    # preprocessing
    sNLP = StanfordNLP()
    parse = sNLP.parse(sents)
    sents = What_Who_QG.remove_modifiers(parse)
    # print("remove modifiers", sents)
    tokenized_sentences = []
    question = ""
    tokenized_sentences.append(word_tokenize(sents))
    q_set = []  # NOTE(review): unused accumulator, kept as-is
    for sent in tokenized_sentences:
        pos_tags = nltk.pos_tag(sent)
        # print(pos_tags)
        # Lower-case the first word unless it is a proper noun.
        if (pos_tags[0][1] != 'NNP') and (pos_tags[0][1] != 'NNPS'):
            pos_tags[0] = (pos_tags[0][0].lower(), pos_tags[0][1])
        q_list = copy.deepcopy(pos_tags)
        q_string = ''  # NOTE(review): unused, kept as-is
        #print(pos_tags)
        for i in range(len(pos_tags)):
            if pos_tags[i][1] == 'VBD':
                # Past tense: lemmatize the verb and front "Why did".
                q_list[i] = (wnl.lemmatize(pos_tags[i][0], pos='v'), 'VBD')
                q_list.insert(0, ('Why did', 0))
                break
            elif pos_tags[i][1] == 'VBZ':
                if pos_tags[i][0] in aux_words:
                    # Auxiliary: move it to the front after "Why".
                    q_list.insert(0, q_list.pop(i))
                    q_list.insert(0, ("Why", 0))
                else:
                    # Main verb: lemmatize, drop a bare "do", front "Why does".
                    q_list[i] = (wnl.lemmatize(pos_tags[i][0], pos='v'), "VBZ")
                    if q_list[i][0] == "do":
                        q_list.pop(i)
                    q_list.insert(0, ("Why does", 0))
                break
            elif pos_tags[i][1] == 'VBP':
                q_list.insert(0, q_list.pop(i))
                q_list.insert(0, ("Why", 0))
                break
        # Capitalize the first word of the question.
        replace_string = q_list[0][0][:1].upper() + q_list[0][0][1:]
        q_list[0] = (replace_string, 0)
        #print(q_list)
        question = ' '.join([i[0] for i in q_list])
        # Cut the question off just before the first why-keyword.
        ind = -1
        for k in why_keys:
            if question.find(k) != -1:
                ind = question.find(k)
                break
        if ind != -1:
            question = question[:ind - 1]
        question = question + "?"
        # print(question)
    if question != "":
        return (question)
    else:
        return None
class QA():
    """Question answering over a passage.

    Classifies a question by its WH-phrase type, generates candidate
    answers by dropping constituents from candidate sentences, and scores
    them with fuzzy string matching. Binary (yes/no) questions are handled
    by dependency-subject overlap plus similarity.
    """

    def __init__(self):
        # CoreNLP wrapper used for parsing/NER/tokenising.
        self.sNLP = StanfordNLP()
        # Question-type tables, populated by initQstType().
        self.dropType = {}
        self.typeNer = {}
        self.typePro = {}
        self.initQstType()
        # Working buffers, refilled per question.
        self.candidateAnswer = []
        self.candidateSentence = []
        self.qgPipeline = QGPipeline()
        # Minimum fuzzy-match score for an answer to be accepted.
        self.threshold = 90

    def initQstType(self):
        """Populate the WH-type tables: constituents to drop per question
        type, auxiliary words, and expected NER labels per question word."""
        self.typeSet = ['WHADJP', 'WHADVP', 'WHPP', 'WHAVP', 'WHNP']
        self.dropType['WHADJP'] = ['NP', 'CD']
        self.dropType['WHAVP'] = ['PP', 'SBAR']
        self.dropType['WHADVP'] = ['PP', 'SBAR']
        self.dropType['WHPP'] = ['PP']
        self.dropType['WHNP'] = ['NP']
        self.dropType['UK'] = ['NP', 'NN']
        self.auxWord = ['did', 'do', 'does', 'is', 'are', 'were', 'was']
        self.typePro['where'] = ['in', 'at', 'on', 'behind', 'next']
        self.typeNer['when'] = ['DATE']
        self.typeNer['where'] = [
            'CITY', 'STATE_OR_PROVINCE', 'ORGANIZATION', 'LOCATION', 'COUNTRY'
        ]

    def decideType(self, myParent):
        """Walk the question parse tree; record the first WH-phrase label in
        self.thisType, remove that node, and store the remaining question
        text (minus the final token) in self.qstSim via the ROOT node."""
        if self.qstFlag:
            return
        for node in myParent:
            #node.pretty_print()
            if self.qstFlag:
                return
            if isinstance(node, str):
                continue
            if node.label() in self.typeSet:
                self.thisType = node.label()
                # NOTE(review): removing while iterating the same parent —
                # relies on the WH phrase being found at most once.
                myParent.remove(node)
                self.qstFlag = True
            self.decideType(node)
            if node.label() == 'ROOT':
                self.qstSim = node.leaves()
                self.qstSim = ' '.join(self.qstSim[:-1])

    def parseDep(self, x):
        """Return the lemmatised, lower-cased (governor, dependent) word pair
        of dependency triple *x*."""
        a = x[0][0].lower()
        a = WordNetLemmatizer().lemmatize(a)
        b = x[2][0].lower()
        b = WordNetLemmatizer().lemmatize(b)
        return (a, b)

    def bin_answer(self, question, sent):
        """Answer a yes/no *question* against *sent*.

        Returns an (answer, score) pair: ('Yes'/'No', 100) when the
        subjects of both dependency parses overlap, otherwise an answer
        derived from fuzzy similarity.
        """
        #print(question, sent)
        qstTree = self.sNLP.dependency_parse(question)
        qstTree = qstTree.__next__()
        qstTree = list(qstTree.triples())
        sentTree = self.sNLP.dependency_parse(sent)
        sentTree = sentTree.__next__()
        sentTree = list(sentTree.triples())
        #print(qstTree, sentTree)
        qstSub = []
        sentSub = []
        flag = False
        neg = False
        for x in qstTree:
            # print(x)
            if x[1] in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
                qstSub.append(self.parseDep(x))
            if x[1] == 'neg':
                neg = True
        for x in sentTree:
            if x[1] in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
                sentSub.append(self.parseDep(x))
                # Subject pair shared by question and sentence -> match.
                if self.parseDep(x) in qstSub:
                    flag = True
        #print(qstSub)
        #print(sentSub)
        if flag:
            if neg:
                return ('No', 100)
            else:
                return ('Yes', 100)
        bin_tags = set(
            ["did", 'do', 'does', 'are', 'is', 'have', 'was', 'were', 'has'])
        question = question.lower()
        sent = sent.lower()
        q_tokens = word_tokenize(question)
        s_tokens = word_tokenize(sent)
        negations = set(['not', 'never', "aren't"])
        ans = ''
        # case 1: negations
        for neg in negations:
            if (neg in q_tokens) and (neg not in s_tokens):
                if ans == "No":
                    ans = "Yes"
                else:
                    ans = "No"
            if (neg in q_tokens) and (neg in s_tokens):
                if ans == "Yes":
                    ans = "No"
                else:
                    ans = "Yes"
        # case 2: similarity
        # NOTE(review): this unconditionally overwrites the negation-based
        # answer computed above — confirm whether that is intended.
        sim = fuzz.partial_ratio(question, sent)
        if sim > 90:
            ans = "Yes"
        else:
            ans = "No"
        return (ans, sim)

    def qstType(self, qst):
        """Classify question *qst*: sets self.thisType (WH label or 'UK'),
        self.qstFlag, and self.qstSim (question text without the WH phrase)."""
        self.thisType = 'UK'
        self.qstFlag = False
        self.qstSim = None
        tree = self.sNLP.parser_sents([
            qst,
        ])
        for i in tree:
            self.decideType(i)

    def fitness(self, txt, qst):
        """Return True when question *qst* can be answered from sentence
        *txt* with a fuzzy score above self.threshold."""
        self.qstType(qst)
        if self.thisType == 'UK':
            # Unknown WH type: score it as a binary question instead.
            _, sim = self.bin_answer(qst, txt)
            return sim > self.threshold
        qstType = self.thisType
        self.candidateAnswer = []
        self.candidateSentence = []
        extendList = []
        # Expand txt with its conjunction-split, simplified variants.
        for thisSent in [txt]:
            extendList.append(thisSent)
            thisParseTree = self.qgPipeline.getParseTree(thisSent)
            no_conj_list = self.qgPipeline.splitConj(thisParseTree)
            simpl_sents = self.qgPipeline.simplify_sentence(no_conj_list)
            for i in simpl_sents:
                extendList.append(i)
        # pdb.set_trace()
        for txt in extendList:
            # print(txt)
            tree = self.sNLP.parser_sents([
                txt,
            ])
            for i in tree:
                # Repeatedly drop the next droppable constituent until no
                # further fragment is produced.
                self.dropTotal = 0
                self.dropFlag = 1
                while self.dropFlag:
                    self.findFlag = 0
                    nowTree = copy.deepcopy(i)
                    self.dropTime = 0
                    nowTree = self.dropFragment(nowTree, qstType)
                    if self.dropTime <= self.dropTotal:
                        self.dropFlag = 0
                    self.dropTotal += 1
        best_dis = 0
        best_ans = '_'
        best_candi = None
        best_sen = None
        for i in range(len(self.candidateSentence)):
            nowSentence = ' '.join(self.candidateSentence[i])
            score = fuzz.partial_ratio(self.qstSim, nowSentence)
            this_ans = ' '.join(self.candidateAnswer[i])
            # print(this_ans, best_ans, score, best_dis)
            if self.qstSim == None:
                continue
            if this_ans == None:
                continue
            if (score >= best_dis):
                # Tie-breaking: prefer shorter answers for WHADVP/WHPP and
                # longer answers for WHNP.
                if score == best_dis and len(this_ans) >= len(
                        best_ans) and self.thisType in ['WHADVP', 'WHPP']:
                    continue
                if score == best_dis and len(this_ans) <= len(
                        best_ans) and self.thisType in ['WHNP']:
                    continue
                best_dis = score
                best_sen = nowSentence
                best_ans = this_ans
        return self.threshold < best_dis

    def dropFragment(self, myParent, qstType):
        """Remove the (dropTotal+1)-th droppable constituent for *qstType*,
        recording its leaves as a candidate answer and the reduced ROOT
        sentence as the matching candidate sentence."""
        flag = 0  # NOTE(review): unused, kept as-is
        for node in myParent:
            if isinstance(node, str):
                continue
            if self.dropTime > self.dropTotal:
                return
            if node.label() in self.dropType[qstType]:
                self.dropTime += 1
                # Only the fragment beyond the already-dropped count is removed.
                if self.dropTime > self.dropTotal:
                    myParent.remove(node)
                    self.candidateAnswer.append(node.leaves())
                    self.findFlag = 1
                    return
            self.dropFragment(node, qstType)
            if node.label() == 'ROOT' and self.findFlag:
                # print(node.leaves())
                self.candidateSentence.append(node.leaves())

    def findFragment(self, myParent, qstType):
        """Collect every constituent whose label is droppable for *qstType*
        into self.candidateAnswer as (leaves, label) pairs."""
        for node in myParent:
            if isinstance(node, str):
                continue
            # node.pretty_print()
            if node.label() in self.dropType[qstType]:
                self.candidateAnswer.append((node.leaves(), node.label()))
            self.findFragment(node, qstType)

    def answerSpecial(self, txtList, tokens, qstType):
        """Answer where/when-style questions by NER-filtering candidate
        fragments from each sentence in *txtList*; prints the first match."""
        # print(tokens[0])
        self.candidateAnswer = []
        self.finalAnswer = []
        self.candidateSentence = []
        for txt in txtList:
            tree = self.sNLP.parser_sents([
                txt,
            ])
            for i in tree:
                self.findFragment(i, qstType)
        for i in self.candidateAnswer:
            sentence = ' '.join(i[0])
            pos_tag = self.sNLP.ner(sentence)
            print(pos_tag)
            # NOTE(review): indexes the *second* NER token — assumes each
            # candidate phrase has at least two tokens; confirm.
            if pos_tag[1][1] in self.typeNer[qstType]:
                # print(pos_tag)
                self.finalAnswer.append(sentence)
        print(self.finalAnswer[0])

    def preProcessText(self, text):
        """Strip parenthesised spans and collapse repeated spaces."""
        data = re.sub("\(.*\)", "", text)
        data = re.sub(' +', ' ', data).strip()
        return data

    def answer(self, txtList, qst):
        """Find and print the best answer to *qst* from sentences *txtList*.

        Binary questions print 'Yes.'/'No.'; WH questions print the best
        candidate fragment (NER-penalised by question word) or a fallback
        message when nothing is found.
        """
        self.head = word_tokenize(qst)[0].lower()
        self.qstType(qst)
        if self.thisType == 'UK':
            # Unknown WH type: treat as a binary (yes/no) question.
            best_score = 0
            best_ans = 'Yes'
            best_sent = '_'
            for txt in txtList:
                ans, sim = self.bin_answer(qst, txt)
                if sim > best_score:
                    best_ans = ans
                    best_score = sim
                    best_sent = txt
            #print('=======')
            #print(best_sent)
            #print(qst)
            print(best_ans + '.')
            #print(best_score)
            #print('=======')
            return
        qstType = self.thisType
        self.candidateAnswer = []
        self.candidateSentence = []
        extendList = []
        for thisSent in txtList:
            thisSent = self.preProcessText(thisSent)
            # Skip sentences too short or too long to answer from.
            if (len(word_tokenize(thisSent)) < 4
                    or len(word_tokenize(thisSent)) > 25):
                continue
            extendList.append(thisSent)
            thisParseTree = self.qgPipeline.getParseTree(thisSent)
            no_conj_list = self.qgPipeline.splitConj(thisParseTree)
            simpl_sents = self.qgPipeline.simplify_sentence(no_conj_list)
            for i in simpl_sents:
                extendList.append(i)
        # pdb.set_trace()
        for txt in extendList:
            # print(txt)
            tree = self.sNLP.parser_sents([
                txt,
            ])
            for i in tree:
                self.dropTotal = 0
                self.dropFlag = 1
                while self.dropFlag:
                    self.findFlag = 0
                    nowTree = copy.deepcopy(i)
                    self.dropTime = 0
                    nowTree = self.dropFragment(nowTree, qstType)
                    if self.dropTime <= self.dropTotal:
                        self.dropFlag = 0
                    self.dropTotal += 1
        best_dis = 0
        best_candi = None
        best_sen = None
        best_ans = '_'
        for i in range(len(self.candidateSentence)):
            nowSentence = ' '.join(self.candidateSentence[i])
            # print(nowSentence)
            # print(self.qstSim)
            score = fuzz.partial_ratio(self.qstSim, nowSentence)
            # print(score)
            # print('----------')
            this_ans = ' '.join(self.candidateAnswer[i])
            # print(this_ans, best_ans, score, best_dis)
            if self.qstSim == None:
                continue
            if this_ans == None:
                continue
            if (score >= best_dis):
                # Tie-breaking by WH type (same rules as fitness()).
                if score == best_dis and len(this_ans) >= len(
                        best_ans) and self.thisType in ['WHADVP', 'WHPP']:
                    continue
                if score == best_dis and len(this_ans) <= len(
                        best_ans) and self.thisType in ['WHNP']:
                    continue
                # Penalise answers whose NER tags don't match the question
                # word (who/when/where): skip unless they win by >= 10.
                if self.head == 'who':
                    ners = getExhaustiveNERs(this_ans)
                    #print(this_ans, ners[0])
                    if 'PERSON' not in ners[0] and 'ORGANIZATION' not in ners[
                            0]:
                        if score - best_dis < 10:
                            continue
                        else:
                            score = score - 10
                if self.head == 'when':
                    ners = getExhaustiveNERs(this_ans)
                    if 'DATE' not in ners[0]:
                        if score - best_dis < 10:
                            continue
                        else:
                            score = score - 10
                if self.head == 'where':
                    ners = getExhaustiveNERs(this_ans)
                    if 'LOCATION' not in ners[0] and 'CITY' not in ners[
                            0] and 'ORGANIZATION' not in ners[
                                0] and 'STATE_OR_PROVINCE' not in ners[
                                    0] and 'COUNTRY' not in ners[0]:
                        if score - best_dis < 10:
                            continue
                        else:
                            score = score - 10
                best_dis = score
                best_sen = nowSentence
                best_ans = this_ans
        #print('++++++++++++++++++')
        #print(qst)
        #print(best_dis)
        #print(best_sen)
        if best_ans == '_':
            print('I cannot answer that question: ' + qst)
        else:
            print(best_ans.capitalize() + '.')
        #print('++++++++++++++++++')

    def edit_distance(self, s1, s2):
        """Case-insensitive Levenshtein distance between sequences s1 and s2."""
        if len(s1) < len(s2):
            return self.edit_distance(s2, s1)
        # len(s1) >= len(s2)
        if len(s2) == 0:
            return len(s1)
        previous_row = range(len(s2) + 1)
        for i, c1 in enumerate(s1):
            c1 = c1.lower()
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                c2 = c2.lower()
                insertions = previous_row[
                    j + 1] + 1  # j+1 instead of j since previous_row and current_row are one character longer
                deletions = current_row[j] + 1  # than s2
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]
        # NOTE(review): everything below is unreachable (it follows the
        # return above) and references names (tokens, best_dis, best_ans)
        # that are never defined in this method — it looks like a leftover
        # fragment of another method. Confirm and remove.
        for i in range(len(self.candidateSentence)):
            nowSentence = self.candidateSentence[i]
            score = self.edit_distance(nowSentence, tokens)
            best_candi = ' '.join(nowSentence)
            this_ans = ' '.join(self.candidateAnswer[i])
            if (score < best_dis or
                    (score == best_dis and len(this_ans) < len(best_ans))):
                best_dis = score
                best_ans = this_ans
        return best_dis
def __init__(self):
    """Create the NLP client plus the simplification and QG components."""
    self.sNLP = StanfordNLP()           # CoreNLP wrapper
    self.sent_simpl = Simplification()  # conjunction splitting / pruning
    self.QG = QuestionGeneration()      # question construction
def __init__(self):
    """Keep a single Stanford CoreNLP wrapper for this instance."""
    self.sNLP = StanfordNLP()
class Simplification:
    """Split complex sentences on conjunctions and prune modifiers.

    Operates on nltk ParentedTree parse trees: moves a leading PP into the
    verb phrases, splits coordinated clauses, and removes adverbial /
    subordinate / parenthetical constituents.
    """

    def __init__(self):
        self.sNLP = StanfordNLP()

    def coref_resolve(self, passage):
        """Resolve coreferences via CoreNLP.

        NOTE(review): *passage* is ignored — a hard-coded demo sentence is
        analysed instead, and nothing is returned. Looks unfinished;
        confirm before relying on it.
        """
        text = "John is a tennis player. He has an awesome physique."
        result = json.loads(self.sNLP.coref(text))
        num, mentions = list(result['corefs'].items())[0]

    def splitConjunctions(self, parent, list_sents, firstPP):
        """Recursively collect clause subtrees split on conjunctions.

        A sentence-initial PP is removed and re-attached as the last child
        of each top-level VP. A CC (except or/nor) whose siblings are both
        S clauses causes those clauses to be appended to *list_sents*,
        which is also the return value.
        """
        firstFlag = 0
        for node in parent:
            if type(node) is ParentedTree:
                if node.label() == 'ROOT':
                    pass
                else:
                    # A sentence-initial PP is detached and carried down so
                    # it can be re-attached to each verb phrase.
                    if (node.label() == "PP" and node.parent().label() == "S"
                            and node.left_sibling() is None):
                        parent.remove(node)
                        firstPP = node
                        firstFlag = 1
                    # Re-attach the carried PP as the last child of each VP.
                    elif (node.label() == "VP" and firstPP is not None
                          and node.parent().label() == 'S'):
                        # Originally guarded by (left disabled upstream):
                        # if node.right_sibling().label() == '.' or node.right_sibling().label() == 'CC':
                        node.insert(len(node), firstPP)
                    # Split on conjunctions iff both siblings of the CC are
                    # S clauses. BUG FIX: the original compared the *list*
                    # node.leaves() against strings, so the or/nor exclusion
                    # never fired; compare the conjunction word itself.
                    if (node.label() == "CC"
                            and node.leaves()[0] not in ('or', 'Or', 'nor', 'Nor')
                            and node.left_sibling() is not None
                            and node.right_sibling() is not None):
                        if node.left_sibling().label() == "S":
                            list_sents.append(node.left_sibling())
                        if node.right_sibling().label() == "S":
                            list_sents.append(node.right_sibling())
                if node.parent() is not None:
                    if firstFlag:
                        # Maintain a copy of the first PP throughout the recursion.
                        firstPP_temp = firstPP.copy(deep=True)
                    else:
                        firstPP_temp = None
                    self.splitConjunctions(node, list_sents, firstPP_temp)
        return list_sents

    def traversalAndSimplification(self, parent):
        """Prune modifiers in place and return the simplified leaf words.

        Removes ADVP/SBAR/SBARQ constituents, a comma-preceded PP inside a
        VP, and comma-bracketed appositives inside an NP.
        """
        for node in parent:
            if type(node) is ParentedTree:
                if node == None:
                    continue
                if node.label() == 'ROOT':
                    pass
                else:
                    if node.label() in ("ADVP", "SBAR", "SBARQ"):
                        parent.remove(node)
                    # NOTE(review): left_sibling() may be None here, which
                    # would raise AttributeError — preserved from the original.
                    elif (node.label() == 'PP'
                          and node.parent().label() == "VP"
                          and node.left_sibling().label() == ','):
                        parent.remove(node)
                    elif (node.parent().label() == 'NP'
                          and node.left_sibling() is not None
                          and node.right_sibling() is not None):
                        # Comma-bracketed appositive: ", X ," inside an NP.
                        if (node.left_sibling().label() == ','
                                and node.right_sibling().label() == ','):
                            parent.remove(node)
                if node.parent() is not None:
                    # Recurse to go deeper into the tree.
                    self.traversalAndSimplification(node)
        return (parent.leaves())
def __init__(self):
    """Initialise the CoreNLP wrapper and the recognised "be" verb forms."""
    self.sNLP = StanfordNLP()
    # Copular verbs that can be fronted to form a yes/no question.
    self.beVerbs = {"am", "is", "are", "was", "were"}
class QuestionGeneration:
    """Generate WH and yes/no questions from a declarative sentence using
    its dependency parse and NER tags."""

    def __init__(self):
        self.sNLP = StanfordNLP()
        self.beVerbs = {"am", "is", "are", "was", "were"}
        # self.aux_verbs = {'is', 'were', 'can', 'could', }

    def auxilaryWord(self, sub, POS_tag):
        """Return the do-support auxiliary for subject *sub*.

        Returns None for subjects not listed below (callers guard with
        ``auxWord if auxWord is not None else ""``).
        """
        # TODO lowercase
        # TODO will may...
        # TODO plural...
        # Jerry and I
        if sub.lower() in ('i', 'they', 'you'):
            return 'do'
        if sub.lower() in ('he', 'she'):
            return 'does'

    def beWork(self, sentence):
        """Front the first "be" verb of token-list *sentence* (mutating it);
        return the reordered list, or None when no "be" verb is found.

        NOTE(review): QG() passes a *string* here; single characters never
        match beVerbs, so this then always returns None — confirm whether a
        token list was intended.
        """
        # pos = nltk.pos_tag(sentence)
        j = None
        for i in range(len(sentence) - 1):
            if sentence[i] in self.beVerbs:
                j = i
                break
        if j is not None:
            temp = sentence[j]
            sentence.pop(j)
            sentence.insert(0, temp)
            #print(sentence)
            return sentence
        return

    # def getNounandVerbOfSentence(self, sentence):

    def QG(self, text):
        """Generate a list of questions from sentence *text*.

        Walks the dependency triples and builds Who/Which/Whose/How many/
        Where/When questions keyed on relation type and NER tags.
        """
        dep_parse_Tree = self.sNLP.dependency_parse(text)
        dep_parse_Tree = dep_parse_Tree.__next__()
        Ques_list = []
        # Yes or No question
        be_question = self.beWork(text)
        if be_question is not None:
            be_question += '?'
            Ques_list.append(be_question)
        # WHO question for Subject
        # create NER tags
        ner_tags = dict(self.sNLP.ner(text))
        pos_tag = self.sNLP.pos(text)
        #print(ner_tags)
        # get triples list of the dependency tree
        triples_list = list(dep_parse_Tree.triples())
        #print(triples_list)
        ##### LOOP THRU DEPENDENCY TREE AND CREATE QUESTIONS
        auxWord = 'xxx'
        for this in triples_list:
            # print(this)
            temp_text = '?'
            # for the subject question
            if this[1] in ['nsubj', 'csubj', 'nsubjpass']:
                subject = None
                sub_pos = None
                # in order of preference
                if this[2][1] in ['NNP', 'NNPS', 'PRP']:
                    subject = this[2][0]
                    sub_pos = this[2][1]
                elif this[0][1] in ['NNP', 'NNPS']:
                    subject = this[0][0]
                    sub_pos = this[0][1]
                elif this[2][1] in ['NN', 'NNS']:
                    subject = this[2][0]
                    sub_pos = this[2][1]
                #print("sub", subject)
                if subject is not None:
                    # need to add sub_pos
                    auxWord = self.auxilaryWord(subject, sub_pos)
                    if ner_tags[subject] in ['PERSON', 'TITLE', 'MISC'
                                             ]:  # check if its a PERSON NER
                        temp_text = self.contructQ(triples_list, subject,
                                                   text, None)
                        temp_text = temp_text.replace(subject, "Who").replace(
                            " .", "?")  # create question
                        # some string manipulation to get the ?
                        if "?" not in temp_text:
                            temp_text = temp_text + "?"
                        # print(text.replace(subject, "Who").replace(" .", "?"))
                    if ner_tags[
                            subject] == 'ORGANIZATION':  # if the subject is ORG
                        temp_text = text.replace(subject,
                                                 "Which organization").replace(
                                                     " .", "?")
                    if ner_tags[subject] == 'CITY':  # if the subject is CITY
                        temp_text = text.replace(subject, "Which city").replace(
                            " .", "?")
                    if ner_tags[
                            subject] == 'COUNTRY':  # if the subject is CITY
                        temp_text = text.replace(subject,
                                                 "Which country").replace(
                                                     " .", "?")
                    if this[2][1] in ['PRP']:  # if the subject is preposition
                        temp_text = text.replace(subject, "Who").replace(" .", "?")
                    if ner_tags[subject] in [
                            'O', 'LOCATION'
                    ] and temp_text == '?':  # if the subject is Other
                        temp_text = self.contructQ(triples_list, subject,
                                                   text, None)
                        if sub_pos == 'PRP' and subject.lower() in [
                                'they', 'he', 'she'
                        ]:
                            temp_text = temp_text.replace(subject, "Who").replace(
                                " .", "?")
                        else:
                            temp_text = temp_text.replace(subject, "What").replace(
                                " .", "?")
            # for number, How many questions
            elif this[1] in ['nummod']:
                numPhrase = this[2][0] + ' ' + this[0][0]
                targetWord = this[2][0]
                # NOTE(review): ('NUMBERS') is a *string*, so `in` is a
                # substring test, not tuple membership — confirm intent.
                if ner_tags[targetWord] in ('NUMBERS'):
                    temp_text = text.replace(numPhrase, "").replace(" .", "?")
                    temp_text = "How many " + this[0][0] + " " + (
                        auxWord if auxWord is not None else "") + " " + temp_text
            # for possessive questions
            elif this[1] in ['nmod:poss']:
                if this[2][1] in ['NNP']:
                    # if this[2][0][-1] == 's':
                    #     poss_word = this[2][0]
                    # else:
                    poss_word = this[2][0]  #+ " 's"
                    temp_text = self.contructQ(triples_list, this[2][0], text,
                                               None)
                    temp_text = temp_text.replace(poss_word, "Whose").replace(
                        " .", "?").replace("'s", "").replace(" '", "")
                    if not temp_text.startswith("Whose"):
                        temp_text = temp_text.replace("Whose", "whose").replace(
                            " '", "")
            # for prop questions
            elif this[1] in ('case'):
                subject = this[0][0]
                propPhrase = this[2][0] + ' ' + this[0][0]
                # print(propPhrase)
                if ner_tags[subject] in ['CITY']:  # where
                    temp_text = text.replace(propPhrase, "").replace(
                        " .", "?")  # create question
                    temp_text = "Where " + (auxWord if auxWord is not None else
                                            "") + " " + temp_text
                    # some string manipulation to get the ?
                if ner_tags[subject] in ['DATE']:  # when
                    temp_text = text.replace(propPhrase, "").replace(" .", "?")
                    # print(auxWord, temp_text)
                    temp_text = "When " + (auxWord if auxWord is not None else
                                           "") + " " + temp_text
            elif this[1] in ('iobj', 'dobj'):
                # code to be written for questions on direct and indirect Objects
                pass
            #### endif
            if "?" not in temp_text:
                temp_text = temp_text + "?"
            if temp_text != '?':
                # print(temp_text)
                Ques_list.append(temp_text)
        return (Ques_list)

    #### in case of the subject has modifiers or the Subject is a part of a long NP remove all the related modifiers of the subject with the help of dependency tree
    #### same to be replicated for Object as well
    def contructQ(self, list_triples, subject, text, object):
        """Strip the subject's dependent modifier words from *text* so the
        remaining string can be turned into a question."""
        if subject is not None:
            text = text[text.find(
                subject
            ):]  ## removing unnecessary determinants (a, the, An) by slicing off until the subject word
            # print(text)
            dict_of_words_removed = {
            }  # subject related word removal to construct a question
            for thisTriple in list_triples:  ## loop thru dependency tree
                if thisTriple[0][0] == subject or thisTriple[0][
                        0] in dict_of_words_removed:
                    if thisTriple[1] not in ['nsubj', 'csubj']:
                        if (thisTriple[2][0]).lower() not in [
                                'the', 'a', 'an'
                        ]:  # skipping determinants as they can be present in other places of the sentence as well
                            text = re.sub(
                                ' +', ' ',
                                text.replace(thisTriple[2][0], '')).strip(
                                )  # removing subject related words
                            dict_of_words_removed[thisTriple[2][
                                0]] = 0  # adding the removed word so that other words that are connected to this can also be removed
        return (text)
def getNerSet(phrase):
    """Return the set of NER labels CoreNLP assigns to *phrase*."""
    tagger = StanfordNLP()
    labels = set()
    for _, label in tagger.ner(phrase):
        labels.add(label)
    return labels
def bin_question(sents):
    """Build yes/no questions from *sents* by fronting an auxiliary/verb.

    The sentence is simplified (modifiers removed) and POS-tagged; the
    first auxiliary or main verb is moved or replaced to form a question.
    Returns a list of question strings (empty when no question is formed).
    """
    sNLP = StanfordNLP()
    parse = sNLP.parse(sents)
    simplified = What_Who_QG.remove_modifiers(parse)
    # print("remove modifiers", simplified)
    aux_words = {'are', 'was', 'were', 'is', 'have', 'has'}
    question_set = []
    for tokens in [word_tokenize(simplified)]:
        tags = nltk.pos_tag(tokens)
        # Lower-case the first word unless it is a proper noun.
        if tags[0][1] not in ('NNP', 'NNPS'):
            tags[0] = (tags[0][0].lower(), tags[0][1])
        q_list = copy.deepcopy(tags)
        for i, (word, tag) in enumerate(tags):
            if word in aux_words:
                # Front the auxiliary verb as-is.
                q_list.insert(0, q_list.pop(i))
                break
            if tag == 'VBD':
                # Past tense: lemmatize and prepend "Did".
                q_list[i] = (wnl.lemmatize(word, pos='v'), 'VBD')
                q_list.insert(0, ('Did', 0))
                break
            if tag == 'VBZ':
                q_list[i] = (wnl.lemmatize(word, pos='v'), "VBZ")
                q_list.insert(0, ("Does", 0))
                break
            if tag == 'VBP':
                q_list[i] = (wnl.lemmatize(word, pos='v'), "VBP")
                q_list.insert(0, ("Do", 0))
                break
        # Only emit a question when a fronted auxiliary actually leads it.
        if q_list[0][0].lower() in [
                'are', 'was', 'were', 'is', 'have', 'has', 'did', 'do', 'does'
        ]:
            capitalised = q_list[0][0][:1].upper() + q_list[0][0][1:]
            q_list[0] = (capitalised, 0)
            question = ' '.join(piece[0] for piece in q_list)
            # Drop the trailing " ." before appending the question mark.
            question = question[:-2] + "?"
            question_set.append(question)
    # print(question_set)
    return question_set
# Legacy helpers retained (disabled) from the original source:
# for i in dep_tree:
#     if i[1] in ['nsubj', 'csubj', 'nsubjpass']:
#         return (i[0][0], i[0][1])
# return (None, None)
#
# def findAuxVerb(dep_tree, verb):
#     aux = ""
#     mod = ""
#     for i in dep_tree:
#         if i[0][0] == verb and i[1] in ["auxpass", "aux"]:
#             aux += i[2][0] + " "
#         if i[0][0] == verb and i[1] in ["adv", "advmod"]:
#             mod += i[2][0] + " "
#     return (aux, mod, verb)

# Shared CoreNLP client for this module.
sNLP = StanfordNLP()


def getDecapitalized(sentence):
    """Lower-case the first token of *sentence* unless NER marks it as a
    proper noun (person, place, organisation, ...)."""
    tokens = sNLP.word_tokenize(sentence)
    ner_tags = sNLP.ner(sentence)
    proper_labels = [
        'PERSON', 'LOCATION', 'ORGANIZATION', 'CITY', 'NATIONALITY',
        'COUNTRY', 'TIME'
    ]
    head = tokens[0]
    if ner_tags[0][1] not in proper_labels:
        head = head.lower()
    return head + " " + " ".join(tokens[1:])