from collections import OrderedDict
from datetime import datetime

import numpy as np
from pytz import UTC  # assumed tz source; the project may define UTC elsewhere
from sklearn.feature_extraction.text import TfidfVectorizer

# Project-internal dependencies (PreProcessor, TensorServer, ShuttleBus,
# Singleton, config, Query, Question, the questions/contexts stores and the
# manhattan/euclidean/cosine helpers) are assumed importable from the
# surrounding package.


class QueryMaker(object):

    def __init__(self):
        self.preprocessor = PreProcessor()
        self.modelWrapper = TensorServer()
        self._question_maker = QuestionMaker()
        self._service_shuttle = ShuttleBus()
        self._service_search = Search()
        self.CONFIG = config.QUERY

    def by_category(self, chat, category, matched_question=None):
        if category == 'shuttle_bus':
            return self._service_shuttle.response()
        elif category == 'talk' or category == 'prepared':
            return {'mode': category, 'answer': matched_question.answer}
        elif category == 'food':
            return {'mode': 'food', 'answer': '학식 보여주기'}
        elif category == 'book':
            return {'mode': 'book', 'answer': '도서관 모드 진입'}
        elif category == 'search':
            answer, output = self._service_search.response(chat)
            if not answer:  # the search service returned no answer
                return {'mode': 'unknown',
                        'answer': '무슨 말인지 모르겠다냥~ 다시 해달라냥'}
            return {'mode': 'search', 'answer': answer, 'output': output}

    def make_query(self, chat, added_time=None, analysis=False):
        chat, removed = self.preprocessor.clean(chat)

        if chat is None or chat == '':  # 'is' on literals was a bug; compare by value
            return None

        if not added_time:
            added_time = datetime.utcnow().astimezone(UTC)
        added_time = added_time.astimezone(UTC)  # the original discarded this result

        def get_top(distances, measure='jaccard'):
            if not distances:
                return None
            assert type(distances) is OrderedDict

            output = {}
            for n, (item, distance) in enumerate(distances.items()):
                if measure == 'jaccard' and distance >= self.CONFIG['jaccard_threshold']:
                    output[n] = (questions.find_by_text(item), distance)
                elif measure == 'cosine' and distance >= self.CONFIG['cosine_threshold']:
                    output[n] = (questions.find_by_text(item), distance)
            if len(output) == 0:
                return None
            return output

        feature_vector = self.modelWrapper.similarity(chat)
        jaccard_similarity = None
        top_feature_distance = None
        category = None
        keywords = self.preprocessor.get_keywords(chat)
        morphs = self.preprocessor.get_morphs(chat)

        # First look for Jaccard-similarity matches above the threshold.
        jaccard_top_distances = get_top(self.get_jaccard(chat), measure='jaccard')

        if jaccard_top_distances and not analysis:
            measurement = '자카드 유사도'
            matched_question, jaccard_similarity = jaccard_top_distances[0]
            category = matched_question.category
        else:
            # No Jaccard match above threshold: fall back to the configured
            # feature-vector distance (Euclidean, Manhattan or cosine).
            feature_top_distances = get_top(
                self.get_similarity(chat, keywords, analysis), measure='cosine')
            if analysis:
                return feature_top_distances
            measurement = self.CONFIG['distance']

            if feature_top_distances is None:
                category = 'search'
                matched_question = None
                top_feature_distance = None
            else:
                matched_question = feature_top_distances[0][0]
                top_feature_distance = feature_top_distances[0][1]
                category = matched_question.category

        answer = self.by_category(chat, category, matched_question)

        query = Query(chat=chat,
                      feature_vector=feature_vector,
                      keywords=keywords,
                      matched_question=matched_question,
                      manhattan_similarity=top_feature_distance,
                      jaccard_similarity=jaccard_similarity,
                      added_time=added_time,
                      answer=answer,
                      morphs=morphs,
                      measurement=measurement,
                      category=category)
        return query

    def get_jaccard(self, chat):
        assert chat is not None
        question_list = questions.find_all()
        assert question_list is not None
        distance_dict = {}

        def _calc_jaccard(A, B):
            # Jaccard similarity over morph items; the 'text' key holds the
            # raw sentence and is excluded, hence the "- 2" below.
            visited = []
            num_union = len(A) + len(B) - 2
            num_joint = 0
            for key_a, tag_a in A.items():
                for key_b, tag_b in B.items():
                    if key_a == 'text' or key_b == 'text':
                        continue
                    if key_a == key_b and tag_a == tag_b and key_a not in visited:
                        num_joint += 1
                        visited.append(key_a)
            return num_joint / (num_union - num_joint)

        chat_morphs = self.preprocessor.get_morphs(chat)
        for each in question_list:
            question_morphs = self.preprocessor.get_morphs(each.text)
            distance_dict[each.text] = _calc_jaccard(chat_morphs, question_morphs)

        return OrderedDict(
            sorted(distance_dict.items(), key=lambda t: t[1], reverse=True))

    def get_similarity(self, chat, keywords, analysis=False):
        assert chat is not None

        feature_vector = self.modelWrapper.similarity(chat)
        question_list = questions.find_by_keywords(keywords=keywords)
        if not question_list:
            # No stored question shares a keyword; falling through to search
            # mode is cheaper than comparing against every question.
            return None

        distances = {}
        a_vector = self.get_weighted_average_vector(chat, feature_vector)
        if type(a_vector) != np.ndarray:
            return None

        for question in question_list:
            b_vector = self.get_weighted_average_vector(
                question.text, question.feature_vector)
            if self.CONFIG['distance'] == 'manhattan':
                distance = manhattan_distance(a_vector, b_vector)
            elif self.CONFIG['distance'] == 'euclidean':
                distance = euclidean_distance(a_vector, b_vector)
            elif self.CONFIG['distance'] == 'cosine':
                distance = cosine_similarity(a_vector, b_vector)
            else:
                raise Exception('CONFIG distance measurement Error!')
            distances[question.text] = distance

        # reverse=True ranks highest scores first, which suits cosine
        # similarity; switch to ascending order when using a true distance
        # such as Euclidean or Manhattan.
        return OrderedDict(
            sorted(distances.items(), key=lambda t: t[1], reverse=True))

    def get_weighted_average_vector(self, text, vector):
        if len(vector.shape) == 1:
            return vector
        assert len(vector.shape) == 2

        text, _ = self.preprocessor.clean(text)
        tokens = self.preprocessor.str_to_tokens(text)
        idf_ = self._question_maker.idf_
        vocabulary_ = self._question_maker.vocabulary_

        output_vector = []
        for i, token in enumerate(tokens):
            idx = vocabulary_[token]
            idf = idf_[idx]
            # Boost each token vector by its IDF weight before summing.
            vector[i] += vector[i] * idf * self.CONFIG['idf_weight']
            output_vector.append(vector[i])

        if output_vector:
            return np.sum(output_vector, axis=0)
        # No usable tokens: return a zero vector (hidden size 768).
        return np.array([0.0] * 768)
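
# --- Sketch: distance helpers assumed by QueryMaker.get_similarity ---
# manhattan_distance, euclidean_distance and cosine_similarity are imported
# from elsewhere in the project; the numpy versions below are a minimal
# sketch of what they are assumed to compute (the _sketch_ prefix avoids
# shadowing the real implementations).

def _sketch_manhattan_distance(a, b):
    # L1 distance between two 1-D vectors.
    return float(np.sum(np.abs(a - b)))


def _sketch_euclidean_distance(a, b):
    # L2 distance between two 1-D vectors.
    return float(np.linalg.norm(a - b))


def _sketch_cosine_similarity(a, b):
    # Cosine of the angle between two 1-D vectors (higher = more similar).
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))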
class Search(metaclass=Singleton):

    def __init__(self):
        self.tfidf_matrix = None
        self.contexts_list = None
        self.CONFIG = config.SEARCH
        self.tfidf_vectorizer = TfidfVectorizer(
            stop_words=None, sublinear_tf=self.CONFIG['sublinear_tf'])
        self.preprocessor = PreProcessor()
        self.set_context()
        self.set_tfidf_matrix()
        self.tensor_server = TensorServer()

    def response(self, chat):
        # Find the most relevant context by TF-IDF score.
        output = self.find_context(chat)
        context = output['context-1']
        score = output['score-1']
        if score == 0:
            return None, None
        answer = self.tensor_server.search(chat, context)
        return answer, output

    def response_with_subject(self, _chat, _subject):
        context = contexts.find_by_subject(_subject=_subject)
        context = context['text']
        answer = self.tensor_server.search(chat=_chat, context=context)
        return answer, context

    def response_with_context(self, _chat, _context):
        return self.tensor_server.search(chat=_chat, context=_context)

    def response_with_id(self, _chat, _id):
        context = contexts.find_by_id(_id=_id)['text']
        return self.tensor_server.search(chat=_chat, context=context), context

    def set_tfidf_matrix(self):
        text_list = [' '.join(self.preprocessor.get_keywords(c['text']))
                     for c in self.contexts_list]
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(
            text_list).todense().tolist()

    def set_context(self):
        self.contexts_list = list(contexts.find_all())

    def find_context(self, chat):
        chat = ' '.join(self.preprocessor.get_keywords(chat))
        chat_tfidf = self.tfidf_vectorizer.transform([chat]).todense().tolist()[0]
        num_context = len(self.tfidf_matrix)

        ordered_list = []
        output = {'context_subject-1': None, 'context_subject-2': None,
                  'context_subject-3': None,
                  'context-1': None, 'context-2': None, 'context-3': None,
                  'score-1': None, 'score-2': None, 'score-3': None}

        # Score each context as the dot product of its TF-IDF row with the
        # chat's TF-IDF row.
        for i in range(num_context):
            context_tfidf = self.tfidf_matrix[i]
            score = sum(c * q for c, q in zip(chat_tfidf, context_tfidf))
            ordered_list.append((i, score))

        ordered_list = sorted(ordered_list, key=lambda x: x[1], reverse=True)

        for i in range(self.CONFIG['max_context_num']):
            idx = ordered_list[i][0]
            output['context_subject-{}'.format(i + 1)] = self.get_context(idx)['subject']
            output['score-{}'.format(i + 1)] = ordered_list[i][1]
            output['context-{}'.format(i + 1)] = self.get_context(idx)['text']
        return output

    def get_context(self, idx):
        return self.contexts_list[idx]
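
# --- Sketch: vectorized form of Search.find_context's scoring loop ---
# find_context scores each context as the dot product of its TF-IDF row with
# the chat's TF-IDF row. If the matrix from fit_transform were kept sparse
# instead of converted via .todense().tolist(), scipy could compute all the
# scores at once; _sketch_tfidf_scores below is illustrative, not part of
# the project.

def _sketch_tfidf_scores(vectorizer, sparse_tfidf_matrix, chat_keywords):
    chat_tfidf = vectorizer.transform([' '.join(chat_keywords)])
    scores = sparse_tfidf_matrix @ chat_tfidf.T  # (num_context, 1) sparse
    return scores.toarray().ravel()              # one score per context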
class QuestionMaker(object):

    def __init__(self):
        self.CONFIG = config.QUESTION
        self.model_wrapper = TensorServer()
        self.preprocessor = PreProcessor()

        vocab = self.preprocessor.vocab[:-1]
        self.tfidf_vectorizer = TfidfVectorizer(
            smooth_idf=True,
            token_pattern=self.CONFIG['tfidf_token_pattern'],
            stop_words=None,
            vocabulary=vocab)
        self.idf_, self.vocabulary_ = self.set_idf()

    def create_question(self, text, answer=None, category=None):
        text, removed = self.preprocessor.clean(text)
        if category not in self.CONFIG['categories']:
            raise Exception(
                'category must be one of {}'.format(self.CONFIG['categories']))

        keywords = self.preprocessor.get_keywords(text=text)
        morphs = self.preprocessor.get_morphs(text=text)
        vector = self.model_wrapper.similarity(text)  # ELMo-like contextual vector

        return Question(text, category, answer, vector, morphs, keywords=keywords)

    def set_idf(self):
        question_list = _questions.find_all()
        raw_documents = []
        for question in question_list:
            text = ' '.join(self.preprocessor.str_to_tokens(question.text))
            raw_documents.append(text)
        self.tfidf_vectorizer.fit_transform(raw_documents=raw_documents)
        idf_ = self.tfidf_vectorizer.idf_
        # idf_ /= max(self.tfidf_vectorizer.idf_)  # normalize by the maximum
        return idf_, self.tfidf_vectorizer.vocabulary_

    def insert_text(self, text, answer=None, category=None):
        question = self.create_question(text, answer, category)
        return _questions.insert(question)

    def rebase(self):
        # Re-create every stored question (fresh vectors, morphs, keywords);
        # on failure, restore the backed-up original.
        questions = _questions.find_all()
        for question in questions:
            backup = None
            orig_text = question.text
            try:
                backup = question
                question = self.create_question(text=question.text,
                                                category=question.category,
                                                answer=question.answer)
                _questions.delete_by_text(orig_text)
                _questions.insert(question)
                print('rebase: {}'.format(question.text))
            except Exception as err:
                print('rebase error:', str(err))
                if backup:
                    _questions.insert(backup)
        return

    def check_idf(self, word):
        return self.idf_[self.vocabulary_[word]]
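
# --- Sketch: typical QuestionMaker usage ---
# A minimal usage example, assuming the question store behind _questions is
# reachable and that 'talk' is one of config.QUESTION['categories']; the
# strings below are illustrative only.

if __name__ == '__main__':
    maker = QuestionMaker()
    # Clean the text, extract keywords/morphs, fetch a feature vector from
    # the TensorServer, and persist the resulting Question.
    maker.insert_text('도서관 몇 시에 열어?',
                      answer='도서관은 오전 9시에 엽니다.',
                      category='talk')
    # Per-token IDF lookup against the fitted vectorizer vocabulary.
    print(maker.check_idf('도서관'))
    # Re-vectorize every stored question after a model or preprocessor change.
    maker.rebase()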