def best_synset(word_str, pos_tag='n'):
    """Return the most likely WordNet synset for *word_str*, or None.

    First tries the canonical first sense (``.01``) of the lemma under the
    WordNet tag mapped from *pos_tag*; if that fails, falls back to the
    synset of the first lemma entry regardless of POS. Accepts either a
    plain string or a spaCy ``Token`` (whose text/POS then override the
    arguments).

    Fixes over the previous version: the first lookup is guarded on the
    mapped ``tag`` (not the raw ``pos_tag``), so a POS with no WordNet
    mapping no longer builds an invalid synset name; the fallback is
    skipped entirely when lemmatization produced nothing, instead of
    calling ``wn.lemmas`` with an empty list; and exceptions are no
    longer raised merely to steer control flow.
    """
    if isinstance(word_str, Token):
        word_str, pos_tag = word_str.text.lower(), word_str.pos_
    assert isinstance(word_str, str)
    assert isinstance(pos_tag, str)

    # lemmatize returns a list of lemmas; only the first is used.
    lemmas_of_word = lemmatize(word_str.lower())
    lemma = lemmas_of_word[0] if lemmas_of_word else None
    tag = to_wordnet_tag(pos_tag)

    # Primary attempt: first sense for the mapped POS tag.
    if lemma and tag:
        try:
            synset = wn.synset('{}.{}.{}'.format(lemma, tag, '01'))
            if synset:
                return synset
        except WordNetError:
            pass

    # Fallback: synset of the first lemma entry, any POS.
    if lemma:
        try:
            wn_lemmas = wn.lemmas(lemma)
            if wn_lemmas:
                return wn_lemmas[0].synset()
        except WordNetError:
            pass

    return None
def get_sentence_with_answer(story, answer):
    """Return the sentence of *story* whose lemmas cover the answer's lemmas.

    If *answer* is a list of candidate strings, the candidate with the
    most words is used. When several sentences contain every answer
    lemma, the one with the greatest lexical overlap with the answer
    wins. Returns None when no sentence contains the answer.

    Fixes: the lowered answer-lemma set is built once instead of on
    every loop iteration, and the redundant one-match/many-match
    branching is collapsed (``max`` of a single-element list is that
    element).
    """
    if isinstance(answer, list):
        # Prefer the longest candidate by word count.
        answer = max(answer, key=lambda x: len(x.split()))

    answer = lemmatize(answer)
    answer_lemmas = {a.lower() for a in answer}  # hoisted out of the loop

    has_answer = [
        sentence
        for sentence in sent_tokenize(story)
        if answer_lemmas <= {s.lower() for s in lemmatize(sentence)}
    ]

    if has_answer:
        return max(has_answer, key=lambda x: calculate_overlap(x, answer))
    return None
def get_phrase_for_what(raw_question, raw_sentence):
    """Extract the answer phrase for a 'what' question from *raw_sentence*.

    Dispatches to the do-/be-specific extractors: first based on the
    question's auxiliary verbs, then on do/be lemmas anywhere in the
    question, finally defaulting to the 'do' extractor.

    Fixes: removed the dead ``do_result``/``be_result`` locals and the
    block of commented-out code; ``key=len`` instead of a lambda.
    """
    q_root = get_spacy_dep_parse(raw_question)
    aux = lemmatize([
        to_sentence(token) for token in q_root
        if token.dep_ in ['aux', 'auxpass']
    ])

    result = None
    if aux:
        if len(aux) == 1:
            if aux[0] == 'do':
                result = get_phrase_for_what_do(raw_question, raw_sentence)
            elif aux[0] == 'be':
                result = get_phrase_for_what_be(raw_question, raw_sentence)
        else:
            # Several auxiliaries: let the longest one decide.
            big_aux = max(aux, key=len)
            if 'do' in big_aux:
                result = get_phrase_for_what_do(raw_question, raw_sentence)
            elif 'be' in big_aux:
                result = get_phrase_for_what_be(raw_question, raw_sentence)
    if result:
        return result

    # No auxiliary signal: look for do/be lemmas anywhere in the question.
    lemmatized = lemmatize(raw_question)
    if 'do' in lemmatized:
        result = get_phrase_for_what_do(raw_question, raw_sentence)
    elif 'be' in lemmatized:
        result = get_phrase_for_what_be(raw_question, raw_sentence)
    if result:
        return result

    # Last resort: treat it as a 'do' question.
    return get_phrase_for_what_do(raw_question, raw_sentence)
def get_top_ner_chunk_of_each_tag(sentence, accepted_tags=("PERSON", "GPE", "ORGANIZATION")):
    """Map each accepted NE tag to its longest contiguous chunk in *sentence*.

    The sentence is lemmatized, POS-tagged and NE-chunked; for every tag
    in *accepted_tags*, the longest (by token count) non-empty phrase is
    kept. Tags with no matching phrase are omitted from the result.
    """
    tagged = nltk.pos_tag(text_analyzer.lemmatize(sentence))
    ne_chunks = text_analyzer.squash_with_ne(nltk.ne_chunk(tagged, binary=False))

    top_chunks = {}
    for ne_tag in accepted_tags:
        phrases = text_analyzer.get_contiguous_x_phrases(ne_chunks, ne_tag)
        candidates = [phrase.split() for phrase in phrases]
        if not candidates:
            continue
        best = max(candidates, key=len)
        # An all-whitespace phrase splits to []; skip those.
        if best:
            top_chunks[ne_tag] = best
    return top_chunks
def __init__(self, raw_question):
    """Parse *raw_question* and classify the root's dependents.

    Runs a spaCy dependency parse, takes the root of the first sentence,
    then breadth-first walks the root's children, sorting each into
    subjects / objects / auxiliaries / prepositions by dependency label
    (per the class-level label sets). Children of conjunction dependents
    are enqueued so coordinated clauses are walked too. The first
    question word seen becomes ``qword``; if none appears among the
    walked dependents, the whole subtree of the root is scanned.

    Fix: removed ~40 lines of commented-out dead code that duplicated
    the BFS below. Logic is otherwise unchanged.
    """
    assert isinstance(raw_question, str)
    self.doc = get_spacy_dep_parse(raw_question)
    self.root = list(self.doc.sents)[0].root
    self.root_synset = best_synset(self.root.text, 'v')

    self.qword = ''        # first question word found, lowercased
    self.subjects = []
    self.objects = []
    self.auxiliaries = []
    self.prepositions = []

    # Breadth-first walk of the root's dependents.
    pool = Queue()
    for child in self.root.children:
        pool.put(child)

    while not pool.empty():
        head = pool.get()
        if head.text.lower() in self.QUESTION_WORDS:
            # Keep only the first question word encountered.
            if self.qword == '':
                self.qword = head.text.lower()
            continue

        dependency = head.dep_
        if dependency in self.SUBJECTS:
            self.subjects.append(head)
        elif dependency in self.OBJECTS:
            self.objects.append(head)
        elif dependency in self.AUXILIARIES:
            self.auxiliaries.append(head)
        elif dependency in self.PREPOSITIONS:
            self.prepositions.append(head)

        # Deliberately a separate `if`: a conjunction's children are
        # walked regardless of how the conjunction itself was classified.
        if dependency in self.CONJUNCTIONS:
            for child in head.children:
                pool.put(child)

    # qword may not have been an immediate dependent of the root; scan
    # the full subtree. (No break: the last match wins, as before.)
    if not self.qword:
        for child in self.root.subtree:
            if child.text.lower() in self.QUESTION_WORDS:
                self.qword = child.text.lower()

    # Lazily-computed caches used elsewhere on the class.
    self._wants_dep = None
    self._wants_pos = None
    self._wants_wordnet = None
    # NOTE(review): assumes lemmatize() is non-empty for every auxiliary
    # token — an empty result would raise IndexError here.
    self._aux_lemmas = [
        lemmatize(aux.text.lower())[0] for aux in self.auxiliaries
    ]