Exemplo n.º 1
0
    def replace_detected_text(self, text_detection_result, text):
        """
        Wrap every detected original text in underscores inside the
        tokenized text, e.g. "doshi" becomes "_doshi_".

        Args:
            text_detection_result (tuple): pair of lists produced by
                TextDetection -
                1. the variants detected
                2. the original substrings of the text
                e.g. ([u'dosh', u'yash'], ['doshi', 'yash'])
            text (str): text to tokenize and mark

        Returns:
            list: tokens of ``text`` with each detected original text
            wrapped in underscores, e.g.
            ['my', 'name', 'is', '_yash_', '_doshi_']
        """
        # Tokenization strategy depends on the configured language;
        # any other language yields an empty token list (as before).
        if self.language == ENGLISH_LANG:
            tokens = nltk_tokenizer.tokenize(text.lower())
        elif self.language == HINDI_LANG:
            tokens = text.lower().strip().split()
        else:
            tokens = []

        original_texts = text_detection_result[1]
        for original in original_texts:
            wrapped = "_" + original + "_"
            tokens = [token.replace(original, wrapped) for token in tokens]

        return tokens
Exemplo n.º 2
0
    def replace_predetected_text(self, predetected_values, text):
        """
        Wrap tokens belonging to predetected values in underscores,
        analogous to replace_detected_text.

        A separate method is needed because predetected_values are phrases,
        not token-level results.
        For example -
            text = "my name is yash doshi"
            predetected_values = ["yash doshi"]
            while, text_detection_original_texts = ["yash", "doshi"]


        Args:
            predetected_values(list): list containing predetected value
                strings; each value may span multiple tokens
            text(str): original text to run detection on

        Returns:
            replaced_text_tokens(list): tokens of ``text`` with every token
                that occurs inside a predetected value wrapped in
                underscores

        Example:
            >> text = "my name is yash doshi"
            >> predetected_values = ["yash doshi"]
            >> replace_predetected_text(predetected_values, text)
            ['my', 'name', 'is', '_yash_', '_doshi_']

        """
        if self.language == ENGLISH_LANG:
            replaced_original_text_tokens = nltk_tokenizer.tokenize(
                text.lower())
            replaced_text_tokens = []
            for index, token in enumerate(replaced_original_text_tokens):
                # Fix to handle tokenizer error for tokens with trailing `.`. For eg.
                # >> text = "my name is v.k. singh"
                # >> tokens = tokenize(text)
                # >> tokens
                #    ["my", "name", "is", "v.k", ".", "singh"]
                # this extra `.` token causes problem while training.
                # Merge the stray "." back into the previous token, but only
                # when the merged form actually occurs in the original text.
                if token == "." and 0 < index < len(replaced_original_text_tokens) - 1 \
                        and replaced_text_tokens[-1] + "." in text.lower():
                    replaced_text_tokens[-1] = replaced_text_tokens[-1] + "."
                else:
                    # Fix to handle examples like `miami,21st street`
                    # where tokenizer gives ["miami,", "21st", "street"].
                    # This causes problems while tagging entities according indices.
                    # For eg is miami is an entity and its indices are (0,5) then due to this extra `,` tagging will be
                    # problem because now length of token will become 6 not 5.
                    _token = token.strip('!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~')
                    # If stripping punctuation leaves nothing (token was pure
                    # punctuation), keep the original token.
                    if not _token:
                        _token = token
                    replaced_text_tokens.append(_token)
        else:
            # Non-English: whitespace tokenization is sufficient.
            replaced_text_tokens = text.lower().strip().split()

        # Mark each individual token of every predetected phrase.
        for name in predetected_values:
            name_tokens = name.split()
            for token in name_tokens:
                for j in range(len(replaced_text_tokens)):
                    replaced_text_tokens[j] = replaced_text_tokens[j].replace(
                        token, "_" + token + "_")

        return replaced_text_tokens
Exemplo n.º 3
0
    def add_data_to_tagger(self, bot_message, user_message):
        """
        Convert the bot and user messages into the token-level format the
        CRF expects and feed each line to the tagger.

        Each message is tokenized and POS-tagged; every token is then added
        to the tagger as "<token> <pos> <direction>", where direction is
        OUTBOUND for the bot message and INBOUND for the user message.

        Args:
            bot_message: message from bot (None is treated as empty string)
            user_message: message from user

        for Example:
            Args:
                bot_message = 'none'
                user_message = 'flights from delhi to goa'

            tokens_bot_message = ['none']
            tokens_user_message = ['flights', 'from', 'delhi', 'to', 'goa']
            pos_bot_message = [['none', 'NN']]
            pos_user_message = [['flights','NNS'], ['from', 'VBP'], ['delhi', 'NN'], ['to', 'TO'], ['goa', 'VB']]

            lines added to the tagger:
                none NN o
                flights NNS i
                from VBP i
                delhi NN i
                to TO i
                goa VB i
        """
        messages = (
            ('' if bot_message is None else bot_message, OUTBOUND),
            (user_message, INBOUND),
        )
        for message, direction in messages:
            tokens = nltk_tokenizer.tokenize(message)
            for tagged in self.pos_tagger.tag(tokens):
                self.tagger.add(
                    str(tagged[0]) + ' ' + str(tagged[1]) + ' ' + direction)
Exemplo n.º 4
0
    def replace_detected_text(self, text_detection_result):
        """
        Wrap every detected original text in underscores inside the
        tokenized ``self.text``, e.g. "doshi" becomes "_doshi_".

        Args:
            text_detection_result (tuple): pair of lists produced by
                TextDetection -
                1. the variants detected
                2. the original substrings of the text
                e.g. ([u'dosh', u'yash'], ['doshi', 'yash'])

        Returns:
            list: tokens of ``self.text`` with each detected original text
            wrapped in underscores, e.g.
            ['my', 'name', 'is', '_yash_', '_doshi_']
        """
        tokens = nltk_tokenizer.tokenize(self.text.lower())
        original_texts = text_detection_result[1]
        for original in original_texts:
            wrapped = "_" + original + "_"
            tokens = [token.replace(original, wrapped) for token in tokens]
        return tokens