예제 #1
0
    def check_condition(self, bot, session, interlocutor, interpreted_phrase, answering_engine):
        """Match the interpreted phrase against the masks of this condition.

        The phrase is tokenized, tagged, lemmatized and chunked with the
        tools exposed by ``answering_engine.text_utils``; the resulting
        PhraseToken list is then tried against each mask in ``self.masks``
        in order.

        Returns:
            A positive RuleConditionMatching holding one group per matched
            mask group (group name upper-cased) for the first mask that
            matches, otherwise a negative RuleConditionMatching.
        """
        phrase_text = interpreted_phrase.interpretation
        utils = answering_engine.text_utils
        words = utils.tokenizer.tokenize(phrase_text)
        tag_rows = list(utils.postagger.tag(words))
        lemma_rows = utils.lemmatizer.lemmatize(tag_rows)

        # Stub used instead of a real syntactic parse: one edge per word,
        # carrying only the word and its position.
        edges = [(word, position, None, None, None) for position, word in enumerate(words)]

        phrase_tokens = []
        for position, (word, tag_row, lemma_row) in enumerate(zip(words, tag_rows, lemma_rows)):
            item = PhraseToken()
            item.word = word
            item.norm_word = word.lower()
            item.lemma = lemma_row[2]
            item.tagset = tag_row[1]
            item.word_index = position
            phrase_tokens.append(item)

        # Record chunk membership on the tokens; the first token of every
        # chunk is additionally flagged as the chunk starter.
        for chunk_no, chunk in enumerate(utils.chunker.parse(words)):
            phrase_tokens[chunk.tokens[0].index].is_chunk_starter = True
            for member in chunk.tokens:
                phrase_tokens[member.index].chunk_index = chunk_no

        for mask in self.masks:
            matching = match(phrase_tokens, mask.mask_terms)
            if not matching:
                continue

            # The first matching mask wins; export all of its groups.
            result = RuleConditionMatching.create(True)
            for group_name, group_tokens in matching.index2group.items():
                normal_words = normalize_chunk(group_tokens, edges, utils.flexer, utils.word2tags)
                result.add_group(group_name.upper(), normal_words, group_tokens)
            return result

        return RuleConditionMatching.create(False)
예제 #2
0
    def extract_chunks(self, sample):
        """Tokenize ``sample`` and return the chunks found by the chunker.

        Args:
            sample: Text passed to ``self.tokenizer.tokenize``.

        Returns:
            Whatever ``self.chunker.parse`` returns for the token list.
        """
        # Earlier revisions also ran the POS tagger and lemmatizer and built
        # a PhraseToken list annotated with chunk indices, but none of that
        # was returned or stored, so only the chunker call is kept. This also
        # avoids iterating the chunker result before handing it to the caller.
        tokens = self.tokenizer.tokenize(sample)
        return self.chunker.parse(tokens)
예제 #3
0
    def check_condition(self, bot, session, interlocutor, interpreted_phrase,
                        answering_engine):
        """Match the phrase against the masks, honouring similarity constraints.

        The raw phrase is used when ``self.is_raw`` is set, otherwise the
        interpretation. The text is tokenized, tagged, lemmatized and chunked
        with ``answering_engine.text_utils`` and the resulting PhraseToken
        list is tried against each mask in ``self.masks`` in order.

        For every group of the first matching mask the chunk is normalized
        twice (default form and with ``target_tags={'ЧИСЛО': 'ЕД'}``) and the
        two word lists are merged. If ``self.constraints_w2v`` has entries
        for the group, each constraint must be met by at least one merged
        word (``word_similarity(c.anchor, word) >= c.sim``).

        Returns:
            A positive RuleConditionMatching with one group per matched mask
            group (name upper-cased), or a negative one when no mask matches
            or a constraint of the first matching mask is not satisfied.
        """
        if self.is_raw:
            input_phrase = interpreted_phrase.raw_phrase
        else:
            input_phrase = interpreted_phrase.interpretation

        text_utils = answering_engine.text_utils
        tokens = text_utils.tokenizer.tokenize(input_phrase)
        tagsets = list(text_utils.postagger.tag(tokens))
        lemmas = text_utils.lemmatizer.lemmatize(tagsets)

        # Stub used instead of a real syntactic parse: one edge per word,
        # carrying only the word and its position.
        edges = [(word, iword, None, None, None)
                 for (iword, word) in enumerate(tokens)]

        phrase_tokens = []
        for word_index, (token, tagset, lemma) in enumerate(
                zip(tokens, tagsets, lemmas)):
            t = PhraseToken()
            t.word = token
            t.norm_word = token.lower()
            t.lemma = lemma[2]
            t.tagset = tagset[1]
            t.word_index = word_index
            phrase_tokens.append(t)

        # Record chunk membership on the tokens; the first token of every
        # chunk is additionally flagged as the chunk starter.
        chunks = text_utils.chunker.parse(tokens)
        for chunk_index, chunk in enumerate(chunks):
            phrase_tokens[chunk.tokens[0].index].is_chunk_starter = True
            for chunk_token in chunk.tokens:
                phrase_tokens[chunk_token.index].chunk_index = chunk_index

        for mask in self.masks:
            mx = match(phrase_tokens, mask.mask_terms)
            if not mx:
                continue

            res = RuleConditionMatching.create(True)
            for group_name, group_tokens in mx.index2group.items():
                default_form = normalize_chunk(group_tokens, edges,
                                               text_utils.flexer,
                                               text_utils.word2tags)
                # presumably requests the singular form of the chunk
                # (tag names are project-defined) - TODO confirm
                singular_form = normalize_chunk(group_tokens, edges,
                                                text_utils.flexer,
                                                text_utils.word2tags,
                                                target_tags={'ЧИСЛО': 'ЕД'})

                # Merge both variants without duplicates while keeping the
                # order of first appearance. A plain set union would make the
                # word order depend on string hash randomization and thus
                # differ from run to run.
                normal_words = []
                seen_words = set()
                for variant in (default_form, singular_form):
                    for word in variant:
                        if word not in seen_words:
                            seen_words.add(word)
                            normal_words.append(word)

                if group_name in self.constraints_w2v:
                    constraints_satisfied = all(
                        any(text_utils.word_similarity(c.anchor, chunk_word) >= c.sim
                            for chunk_word in normal_words)
                        for c in self.constraints_w2v[group_name])
                    if not constraints_satisfied:
                        # NOTE(review): this rejects the whole condition
                        # instead of trying the remaining masks - confirm
                        # that this is intended.
                        return RuleConditionMatching.create(False)

                res.add_group(group_name.upper(), normal_words, group_tokens)

            return res

        return RuleConditionMatching.create(False)
예제 #4
0
    def do_action(self, bot, session, interlocutor, interpreted_phrase,
                  condition_matching_results, text_utils):
        """Make the bot say one phrase of this rule that it has not said yet.

        If ``self.np_sources`` is non-empty, every question in it that the
        engine reports as answerable is answered first; the first answer is
        tokenized and stored in ``condition_matching_results`` (created on
        demand) as a group named after the source key.

        Candidate phrases come from ``self.phrases`` and, when none of them
        is eligible, from ``self.exhausted_phrases``. A phrase is eligible
        when, after ``self.prepare4saying``, it contains no ``$``,
        ``session.count_bot_phrase`` returns 0 for it and, under the
        ``'skip'`` known-answer policy, it is not a question the bot already
        knows the answer to.

        Returns:
            True if a phrase was passed to ``bot.say``, False if no phrase
            was eligible, and None if an ``np_sources`` question was reported
            as answerable but ``build_answers`` produced no answers.
        """
        if self.np_sources:
            if condition_matching_results is None:
                condition_matching_results = RuleConditionMatching.create(True)

            for slot_name, question in self.np_sources.items():
                if not bot.get_engine().does_bot_know_answer(
                        question, bot, session, interlocutor):
                    continue

                answers = bot.get_engine().build_answers(
                    session, bot, interlocutor, InterpretedPhrase(question))
                if not answers:
                    return None

                tokens = text_utils.tokenize(answers[0])
                tagsets = list(text_utils.postagger.tag(tokens))
                lemmas = text_utils.lemmatizer.lemmatize(tagsets)

                phrase_tokens = []
                for word_index, (token, tagset, lemma) in enumerate(
                        zip(tokens, tagsets, lemmas)):
                    t = PhraseToken()
                    t.word = token
                    t.norm_word = token.lower()
                    t.lemma = lemma[2]
                    t.tagset = tagset[1]
                    t.word_index = word_index
                    phrase_tokens.append(t)

                condition_matching_results.add_group(
                    slot_name, tokens, phrase_tokens)

        def collect_fresh(templates):
            # Render each template and keep only the eligible utterances.
            fresh = []
            for template in templates:
                utterance = self.prepare4saying(template,
                                                condition_matching_results,
                                                text_utils)

                # A remaining '$' means a slot could not be filled in.
                if '$' in utterance:
                    continue

                # Skip utterances the bot has already said in this session.
                if session.count_bot_phrase(utterance) != 0:
                    continue

                # endswith() instead of [-1] so an empty utterance does not
                # raise IndexError.
                if (self.known_answer_policy == 'skip'
                        and utterance.endswith('?')
                        and bot.does_bot_know_answer(utterance, session,
                                                     interlocutor)):
                    # The bot already knows the answer to this question.
                    continue

                fresh.append(utterance)
            return fresh

        # First try the regular phrases that have not been said yet.
        candidates = collect_fresh(self.phrases)
        if candidates:
            if len(candidates) == 1:
                bot.say(session, candidates[0])
            else:
                bot.say(session, random.choice(candidates))
            return True

        # All regular phrases are used up (or unusable): fall back to the
        # phrases reserved for that situation, if any are configured.
        fallback = collect_fresh(self.exhausted_phrases)
        if fallback:
            bot.say(session, random.choice(fallback))
            return True

        # Nothing is repeated: once every phrase has been used, the rule
        # stays silent regardless of the known-answer policy.
        return False