Example #1
# Assumes NLTK's WordNet corpus is available (nltk.download('wordnet'));
# lemmatize() and to_wordnet_tag() are helpers defined elsewhere in this module.
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import WordNetError
from spacy.tokens import Token


def best_synset(word_str, pos_tag='n'):
    # Accept either a plain string or a spaCy Token.
    if isinstance(word_str, Token):
        word_str, pos_tag = word_str.text.lower(), word_str.pos_
    assert isinstance(word_str, str)
    assert isinstance(pos_tag, str)

    lemma = lemmatize(word_str.lower())
    if lemma:
        lemma = lemma[0]
    tag = to_wordnet_tag(pos_tag)

    try:
        # Try the canonical "<lemma>.<tag>.01" sense first.
        if lemma and tag:
            synset = wn.synset('{}.{}.{}'.format(lemma, tag, '01'))
            if synset:
                return synset
        raise WordNetError('no direct sense found')
    except WordNetError:
        # Fall back to the first synset of the first matching lemma.
        try:
            lemmas = wn.lemmas(lemma) if lemma else []
            if lemmas:
                synset = lemmas[0].synset()
                if synset:
                    return synset
            raise WordNetError('no matching lemma found')
        except WordNetError:
            pass
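
A minimal usage sketch of the two WordNet lookups the helper wraps (assumes the
WordNet corpus has been downloaded via nltk.download('wordnet'); the word 'dog'
is only an illustration):

from nltk.corpus import wordnet as wn

# Canonical first sense, the path best_synset tries first.
print(wn.synset('dog.n.01'))
# Fallback path: first synset of the first matching lemma.
print(wn.lemmas('dog')[0].synset())
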
Example #2
# Assumes NLTK's punkt sentence tokenizer is available; lemmatize() and
# calculate_overlap() are helpers defined elsewhere in this module.
from nltk.tokenize import sent_tokenize


def get_sentence_with_answer(story, answer):
    # If several answer variants are given, keep the longest one (by word count).
    if isinstance(answer, list):
        answer = max(answer, key=lambda x: len(x.split()))

    sentences = sent_tokenize(story)
    answer = lemmatize(answer)
    has_answer = []
    for sentence in sentences:
        # A sentence "has" the answer if every answer lemma occurs in it.
        sentence_lemmas = lemmatize(sentence)
        if {a.lower() for a in answer} <= {s.lower() for s in sentence_lemmas}:
            has_answer.append(sentence)

    if len(has_answer) == 1:
        return has_answer[0]
    elif len(has_answer) > 1:
        # Break ties by the largest lexical overlap with the answer.
        return max(has_answer, key=lambda x: calculate_overlap(x, answer))
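
A small sketch of the two building blocks above: NLTK's sentence splitter and
the set-subset test that decides whether a sentence contains every answer lemma
(assumes nltk.download('punkt') has been run; the strings are invented):

from nltk.tokenize import sent_tokenize

print(sent_tokenize("The dog ran fast. It was late."))
# ['The dog ran fast.', 'It was late.']
print({'dog', 'run'} <= {'the', 'dog', 'run', 'fast'})
# True: every answer lemma appears among the sentence lemmas
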
Example #3
# get_spacy_dep_parse(), lemmatize(), to_sentence() and the two
# get_phrase_for_what_* helpers are defined elsewhere in this module.
def get_phrase_for_what(raw_question, raw_sentence):
    q_root = get_spacy_dep_parse(raw_question)
    # Lemmatize the question's auxiliary verbs ('do', 'be', ...).
    aux = lemmatize([
        to_sentence(token) for token in q_root
        if token.dep_ in ['aux', 'auxpass']
    ])
    result = None
    if aux:
        if len(aux) == 1:
            if aux[0] == 'do':
                result = get_phrase_for_what_do(raw_question, raw_sentence)
            elif aux[0] == 'be':
                result = get_phrase_for_what_be(raw_question, raw_sentence)
        else:
            # Several auxiliaries: decide based on the longest one.
            big_aux = max(aux, key=len)
            if 'do' in big_aux:
                result = get_phrase_for_what_do(raw_question, raw_sentence)
            elif 'be' in big_aux:
                result = get_phrase_for_what_be(raw_question, raw_sentence)

        if result:
            return result

    # No usable auxiliary: fall back to scanning all lemmas of the question.
    lemmatized = lemmatize(raw_question)
    if 'do' in lemmatized:
        result = get_phrase_for_what_do(raw_question, raw_sentence)
    elif 'be' in lemmatized:
        result = get_phrase_for_what_be(raw_question, raw_sentence)

    if result:
        return result

    return get_phrase_for_what_do(raw_question, raw_sentence)
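
A minimal sketch of the auxiliary detection this function starts from (assumes
spaCy and its en_core_web_sm model are installed; the sample question is
invented):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp("What does the dog do every morning?")
# These 'aux'/'auxpass' dependents are what the code above lemmatizes to 'do'/'be'.
print([(t.text, t.dep_) for t in doc if t.dep_ in ('aux', 'auxpass')])
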
Example #4
# Assumes NLTK's POS tagger and NE chunker models are available; text_analyzer
# is a helper module from the same project.
import nltk


def get_top_ner_chunk_of_each_tag(sentence,
                                  accepted_tags=("PERSON", "GPE",
                                                 "ORGANIZATION")):
    # Lemmatize, POS-tag and NE-chunk the sentence, then flatten the resulting
    # tree with the project's squash_with_ne() helper.
    named_question_chunks = text_analyzer.squash_with_ne(
        nltk.ne_chunk(nltk.pos_tag(text_analyzer.lemmatize(sentence)),
                      binary=False))
    top_chunks = {}
    for tag in accepted_tags:
        # All contiguous phrases carrying this NE tag, split into tokens.
        question_chunks = [
            x.split() for x in text_analyzer.get_contiguous_x_phrases(
                named_question_chunks, tag)
        ]
        if question_chunks:
            # Keep only the longest phrase per tag.
            top_question_chunk = max(question_chunks, key=len)
            if len(top_question_chunk) > 0:
                top_chunks[tag] = top_question_chunk
    return top_chunks
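
A minimal sketch of the plain NLTK NER pipeline the helper builds on (assumes
the punkt, averaged_perceptron_tagger, maxent_ne_chunker and words resources
have been downloaded; the sentence is invented):

import nltk

tokens = nltk.word_tokenize("Barack Obama visited Paris")
tree = nltk.ne_chunk(nltk.pos_tag(tokens), binary=False)
print(tree)  # a Tree with subtrees labelled PERSON, GPE, ...
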
Example #5
    # Requires "from queue import Queue" at module level; QUESTION_WORDS,
    # SUBJECTS, OBJECTS, CONJUNCTIONS, AUXILIARIES and PREPOSITIONS are
    # class-level constants defined elsewhere in this class.
    def __init__(self, raw_question):
        assert isinstance(raw_question, str)

        # Parse the question and take the root of its first sentence.
        self.doc = get_spacy_dep_parse(raw_question)
        self.root = list(self.doc.sents)[0].root
        self.root_synset = best_synset(self.root.text, 'v')

        self.qword = ''
        self.subjects = []
        self.objects = []
        # self.conjunctions = []
        self.auxiliaries = []
        self.prepositions = []

        # for child in self.root.children:
        #     if child.text.lower() in self.QUESTION_WORDS:
        #         if self.qword == '':
        #             self.qword = child.text.lower()
        #         continue
        #
        #     dependency = child.dep_
        #     if dependency in self.SUBJECTS:
        #         self.subjects.append(child)
        #     elif dependency in self.OBJECTS:
        #         self.objects.append(child)
        #     elif dependency in self.CONJUNCTIONS:
        #         self.conjunctions.append(child)
        #     elif dependency in self.AUXILIARIES:
        #         self.auxiliaries.append(child)
        #     elif dependency in self.PREPOSITIONS:
        #         self.prepositions.append(child)
        #
        # if self.conjunctions:
        #     for complement in self.conjunctions:
        #         for child in complement.children:
        #             if child.text.lower() in self.QUESTION_WORDS:
        #                 if self.qword == '':
        #                     self.qword = child.text.lower()
        #                 continue
        #
        #             dependency = child.dep_
        #             if dependency in self.SUBJECTS:
        #                 self.subjects.append(child)
        #             elif dependency in self.OBJECTS:
        #                 self.objects.append(child)
        #             elif dependency in self.CONJUNCTIONS:
        #                 self.objects.append(child)
        #             elif dependency in self.AUXILIARIES:
        #                 self.auxiliaries.append(child)
        #             elif dependency in self.PREPOSITIONS:
        #                 self.prepositions.append(child)

        # Breadth-first walk over the root's dependents, following
        # conjunctions so that coordinated clauses are covered as well.
        pool = Queue()
        for child in self.root.children:
            pool.put(child)

        while not pool.empty():
            head = pool.get()

            # Record the first question word encountered (who/what/when/...).
            if head.text.lower() in self.QUESTION_WORDS:
                if self.qword == '':
                    self.qword = head.text.lower()
                continue

            dependency = head.dep_
            if dependency in self.SUBJECTS:
                self.subjects.append(head)
            elif dependency in self.OBJECTS:
                self.objects.append(head)
            elif dependency in self.AUXILIARIES:
                self.auxiliaries.append(head)
            elif dependency in self.PREPOSITIONS:
                self.prepositions.append(head)

            if dependency in self.CONJUNCTIONS:
                # Descend into conjunct subtrees as well.
                for child in head.children:
                    pool.put(child)

        # The question word may not have been an immediate dependent of the
        # root, so scan the whole subtree as a fallback.
        if not self.qword:
            for child in self.root.subtree:
                if child.text.lower() in self.QUESTION_WORDS:
                    self.qword = child.text.lower()

        self._wants_dep = None
        self._wants_pos = None
        self._wants_wordnet = None
        # Lemmatized forms of the auxiliary verbs ('do', 'be', 'have', ...).
        self._aux_lemmas = [
            lemmatize(aux.text.lower())[0] for aux in self.auxiliaries
        ]
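
A minimal sketch of the breadth-first dependency walk used above (assumes spaCy
and the en_core_web_sm model are installed; the question is invented):

import spacy
from queue import Queue

nlp = spacy.load('en_core_web_sm')
root = list(nlp("What did the boy give to his friend?").sents)[0].root
pool = Queue()
for child in root.children:
    pool.put(child)
while not pool.empty():
    tok = pool.get()
    print(tok.text, tok.dep_)  # e.g. What/dobj, did/aux, boy/nsubj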