Example #1
    def _text_detection_with_variants(self):
        """
        This function normalises the message by breaking it into trigrams, bigrams and unigrams, and uses the
        generated ngrams to query the datastore. The results come back as a dictionary mapping each variant to
        its entity value; each variant is then matched back against the message to recover the original text
        that was detected, and the detected values and texts are returned.

        Returns:
             A tuple of two lists: the first contains the detected entity values and the second contains their
             corresponding substrings from the original message.
        """
        original_final_list = []
        value_final_list = []
        normalization = Normalization()
        self.text_dict = normalization.ngram_data(
            self.processed_text.lower(),
            flag_punctuation_removal=False,
            stem_unigram=False,
            stem_bigram=False,
            stem_trigram=False,
            stop_words_unigram=True,
            stop_words_bigram=True,
            stop_words_trigram=True).copy()
        variant_dictionary = {}

        trigram_variants = self.db.get_similar_ngrams_dictionary(
            self.entity_name, self.text_dict['trigram'], self._fuzziness)
        bigram_variants = self.db.get_similar_ngrams_dictionary(
            self.entity_name, self.text_dict['bigram'], self._fuzziness)
        unigram_variants = self.db.get_similar_ngrams_dictionary(
            self.entity_name, self.text_dict['unigram'], self._fuzziness)
        variant_dictionary.update(trigram_variants)
        variant_dictionary.update(bigram_variants)
        variant_dictionary.update(unigram_variants)
        variant_list = variant_dictionary.keys()

        exact_matches, fuzzy_variants = [], []
        for variant in variant_list:
            if variant.lower() in self.processed_text.lower():
                exact_matches.append(variant)
            else:
                fuzzy_variants.append(variant)

        exact_matches.sort(key=lambda s: len(tokenizer.tokenize(s)),
                           reverse=True)
        fuzzy_variants.sort(key=lambda s: len(tokenizer.tokenize(s)),
                            reverse=True)
        variant_list = exact_matches + fuzzy_variants

        for variant in variant_list:
            original_text = self._get_entity_from_text(
                variant, self.processed_text.lower())
            if original_text:
                value_final_list.append(variant_dictionary[variant])
                original_final_list.append(original_text)
                self.processed_text = re.sub(r'\b' + re.escape(original_text) + r'\b',
                                             self.tag, self.processed_text)

        return value_final_list, original_final_list
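The ordering step above, exact substring matches before fuzzy candidates and longer ngrams before shorter ones within each group, is what lets a variant such as 'new delhi' claim the message before 'delhi' can. A minimal standalone sketch of that prioritisation, using str.split in place of the project's tokenizer and a hypothetical variant dictionary:

def order_variants(variant_dictionary, message):
    # Exact substring matches are preferred over fuzzy candidates; within each
    # group, variants with more tokens are tried first so longer ngrams win.
    exact_matches, fuzzy_variants = [], []
    for variant in variant_dictionary:
        if variant.lower() in message.lower():
            exact_matches.append(variant)
        else:
            fuzzy_variants.append(variant)
    exact_matches.sort(key=lambda s: len(s.split()), reverse=True)
    fuzzy_variants.sort(key=lambda s: len(s.split()), reverse=True)
    return exact_matches + fuzzy_variants

# hypothetical variants as they might come back from the datastore query
variants = {'delhi': 'New Delhi', 'new delhi': 'New Delhi', 'dehli': 'New Delhi'}
print(order_variants(variants, 'flights from new delhi'))
# ['new delhi', 'delhi', 'dehli']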
Example #2
    def _get_entity_from_text(self, variant, text):
        """
        Checks ngrams of the text for similarity against the variant (can be an ngram) using Levenshtein distance

        Args:
            variant: string, ngram of variant to fuzzy detect in the text using Levenshtein distance
            text: text to detect entities from

        Returns:
            part of the given text that was detected as entity given the variant, None otherwise

        Example:
            text_detection = TextDetector('city')
            ...
            text = 'Come to Chennai, Tamil Nadu,  I will visit Delehi next year'.lower()

            text_detection._get_entity_from_text('chennai', text)

            Output:
                'chennai'

            text_detection._get_entity_from_text('Delhi', text)

            Output:
                'delehi'
        """
        variant_tokens = tokenizer.tokenize(variant.lower())
        text_tokens = tokenizer.tokenize(text.lower())
        original_text = []
        variant_count = 0
        for text_token in text_tokens:
            variant_token = variant_tokens[variant_count]

            utext_token = text_token
            # decode byte strings to unicode before measuring token length
            if isinstance(utext_token, bytes):
                utext_token = utext_token.decode('utf-8')

            same = variant_token == text_token
            ft = self._get_fuzziness_threshold_for_token(utext_token)
            if same or (len(utext_token) > self._min_token_size_for_fuzziness
                        and edit_distance(string1=variant_token,
                                          string2=text_token,
                                          max_distance=ft + 1) <= ft):
                original_text.append(text_token)
                variant_count += 1
                if variant_count == len(variant_tokens):
                    return ' '.join(original_text)
            else:
                original_text = []
                variant_count = 0
        return None
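The loop above walks the message tokens, consuming one variant token per matching message token; any mismatch resets the window, so only a contiguous run of tokens can produce a match. A self-contained sketch of that behaviour, with a plain Levenshtein function standing in for the project's edit_distance helper, str.split for the tokenizer, and illustrative fuzziness defaults:

def simple_edit_distance(a, b):
    # plain Levenshtein distance, standing in for the project's edit_distance
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                # deletion
                            curr[j - 1] + 1,            # insertion
                            prev[j - 1] + (ca != cb)))  # substitution
        prev = curr
    return prev[-1]

def find_variant_span(variant, text, fuzziness=1, min_token_size=4):
    # Walk the message tokens, consuming one variant token per match; a
    # mismatch resets the window so only contiguous spans can succeed.
    variant_tokens = variant.lower().split()
    matched, position = [], 0
    for token in text.lower().split():
        same = token == variant_tokens[position]
        fuzzy = (len(token) > min_token_size and
                 simple_edit_distance(variant_tokens[position], token) <= fuzziness)
        if same or fuzzy:
            matched.append(token)
            position += 1
            if position == len(variant_tokens):
                return ' '.join(matched)
        else:
            matched, position = [], 0
    return None

print(find_variant_span('Delhi', 'i will visit delehi next year'))  # 'delehi'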
Example #3
    def _text_detection_with_variants(self):
        """
        This function normalises the message by breaking it into trigrams, bigrams and unigrams, and uses the
        generated ngrams to query the datastore. The results come back as a dictionary mapping each variant to
        its entity value; each variant is then matched back against the message to recover the original text
        that was detected, and the detected values and texts are returned.

        Returns:
             A tuple of two lists: the first contains the detected entity values and the second contains their
             corresponding substrings from the original message.
        """
        original_final_list = []
        value_final_list = []
        variant_dictionary = {}

        tokens = tokenizer.tokenize(self.processed_text)
        message = u' '.join(tokens)
        variants = self.db.get_similar_dictionary(
            self.entity_name,
            message,
            self._fuzziness,
            search_language_script=self._target_language_script)
        variant_dictionary.update(variants)
        variant_list = variant_dictionary.keys()

        exact_matches, fuzzy_variants = [], []
        for variant in variant_list:
            if variant.lower() in self.processed_text.lower():
                exact_matches.append(variant)
            else:
                fuzzy_variants.append(variant)

        exact_matches.sort(key=lambda s: len(tokenizer.tokenize(s)),
                           reverse=True)
        fuzzy_variants.sort(key=lambda s: len(tokenizer.tokenize(s)),
                            reverse=True)
        variant_list = exact_matches + fuzzy_variants

        for variant in variant_list:
            original_text = self._get_entity_from_text(
                variant, self.processed_text.lower())
            if original_text:
                value_final_list.append(variant_dictionary[variant])
                original_final_list.append(original_text)
                _pattern = re.compile(r'\b%s\b' % re.escape(original_text), re.UNICODE)
                self.tagged_text = _pattern.sub(self.tag, self.tagged_text)
                # Instead of dropping completely like in other entities,
                # we replace with tag to avoid matching non contiguous segments
                self.processed_text = _pattern.sub(self.tag,
                                                   self.processed_text)
        return value_final_list, original_final_list
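The substitution at the end is what the inline comment refers to: the detected span is replaced by the entity tag in both tagged_text and processed_text, so a later variant cannot stitch a match together from words that were already consumed. A small sketch of that word-boundary substitution; the '__city__' tag here is only a placeholder for self.tag:

import re

def tag_detected_span(text, original_text, tag='__city__'):
    # Replace the detected words with the tag on word boundaries so they can
    # no longer take part in a later, non contiguous match; re.escape guards
    # against regex metacharacters in the detected text.
    pattern = re.compile(r'\b%s\b' % re.escape(original_text), re.UNICODE)
    return pattern.sub(tag, text)

text = 'come to chennai tamil nadu i will visit delehi next year'
print(tag_detected_span(text, 'chennai'))
# 'come to __city__ tamil nadu i will visit delehi next year'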
Example #4
    def _get_entity_from_text(self, variant, text):
        """
        Checks ngrams of the text for similarity against the variant (can be an ngram) using Levenshtein distance

        Args:
            variant: string, ngram of variant to fuzzy detect in the text using Levenshtein distance
            text: text to detect entities from

        Returns:
            part of the given text that was detected as entity given the variant, None otherwise

        Example:
            text_detection = TextDetector('city')
            ...
            text = 'Come to Chennai, Tamil Nadu,  I will visit Delehi next year'.lower()

            text_detection._get_entity_from_text('chennai', text)

            Output:
                'chennai'

            text_detection._get_entity_from_text('Delhi', text)

            Output:
                'delehi'
        """
        variant_token_list = tokenizer.tokenize(variant.lower())
        text_token_list = tokenizer.tokenize(text.lower())
        original_text = []
        variant_count = 0
        token_count = 0
        while token_count < len(text_token_list):
            levenshtein = Levenshtein(variant_token_list[variant_count],
                                      text_token_list[token_count],
                                      self.fuzziness_threshold + 1)
            if variant_token_list[variant_count] == text_token_list[token_count] or \
                    (len(text_token_list[token_count]) > self.min_size_token_for_levenshtein and
                     levenshtein.levenshtein_distance() <= self.fuzziness_threshold):
                original_text.append(text_token_list[token_count])
                variant_count += 1
                if variant_count == len(variant_token_list):
                    return ' '.join(original_text)
            else:
                original_text = []
                variant_count = 0

            token_count += 1
        return None
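This variant constructs the Levenshtein helper with self.fuzziness_threshold + 1 as an upper bound, which lets the distance computation stop as soon as the threshold can no longer be met. One way such a capped distance could be written (a sketch under that assumption, not the project's actual Levenshtein class):

def capped_edit_distance(a, b, max_distance):
    # Levenshtein distance with an early exit: once every cell of a row is
    # above max_distance, the final distance cannot drop back below it.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,
                            curr[j - 1] + 1,
                            prev[j - 1] + (ca != cb)))
        if min(curr) > max_distance:
            return max_distance + 1  # already beyond the threshold
        prev = curr
    return prev[-1]

print(capped_edit_distance('delhi', 'delehi', 2))  # 1, within the threshold
print(capped_edit_distance('delhi', 'mumbai', 2))  # 3, i.e. beyond it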
Example #5
def sort_original_text(original_text_list):
    """
    Sorts the original text list based on tokens and length of string
    :param original_text_list:
    :return:
    """
    final_original_text = []
    sort_original_text_dict = defaultdict(list)
    original_text_list.sort(key=lambda s: len(tokenizer.tokenize(s)),
                            reverse=True)
    for original in original_text_list:
        length_of_token = len(tokenizer.tokenize(original))
        sort_original_text_dict[length_of_token].append(original)
    for token_length in reversed(sorted(sort_original_text_dict.keys())):
        list_of_tokens = sort_original_text_dict[token_length]
        list_of_tokens.sort(key=lambda s: len(s), reverse=True)
        final_original_text.extend(list_of_tokens)
    return final_original_text
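For example, assuming the tokenizer splits these strings on whitespace, multi-token detections come first and, within the same token count, longer strings come before shorter ones:

detected = ['goa', 'new delhi', 'delhi', 'mumbai']
print(sort_original_text(detected))
# ['new delhi', 'mumbai', 'delhi', 'goa']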
Example #6
    def add_data_to_tagger(self, bot_message, user_message):
        """
        As explained, the CRF needs data in a particular format; this function converts the bot_message and
        user_message into that format and adds it to the tagger.

        Args:
            bot_message: message from bot
            user_message: message from user

        For example:
            Args:
                bot_message = 'none'
                user_message = 'flights from delhi to goa'

            This function then tokenizes the bot and user messages, gets their POS tags, labels each token as
            outbound or inbound depending on the sender, and adds the result to the tagger object.

            tokens_bot_message = ['none']
            tokens_user_message = ['flights', 'from', 'delhi', 'to', 'goa']
            pos_bot_message = [['none', 'NN']]
            pos_user_message = [['flights','NNS'], ['from', 'VBP'], ['delhi', 'NN'], ['to', 'TO'], ['goa', 'VB']]

            none NN o
            flights NNS i
            from VBP i
            delhi NN i
            to TO i
            goa VB i
        """
        if bot_message is None:
            bot_message = ''

        tokens_bot_message = tokenizer.tokenize(bot_message)
        tokens_user_message = tokenizer.tokenize(user_message)

        pos_bot_message = self.pos_tagger.tag(tokens_bot_message)
        pos_user_message = self.pos_tagger.tag(tokens_user_message)
        for token in pos_bot_message:
            self.tagger.add(
                str(token[0]) + ' ' + str(token[1]) + ' ' + OUTBOUND)

        for token in pos_user_message:
            self.tagger.add(
                str(token[0]) + ' ' + str(token[1]) + ' ' + INBOUND)
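The tagger ends up holding one 'token POS label' line per token, with the OUTBOUND label ('o' in the example above) for bot tokens and the INBOUND label ('i') for user tokens. A rough sketch of building those lines, using nltk.pos_tag in place of the class's POS tagger and a plain list instead of the tagger object:

import nltk  # assumes the 'averaged_perceptron_tagger' data has been downloaded

OUTBOUND = 'o'  # assumed label values taken from the docstring example
INBOUND = 'i'

def build_crf_lines(bot_message, user_message):
    # One 'token POS label' line per token: bot tokens first, then user tokens.
    bot_message = bot_message or ''
    lines = []
    for message, label in ((bot_message, OUTBOUND), (user_message, INBOUND)):
        for token, pos in nltk.pos_tag(message.split()):
            lines.append('%s %s %s' % (token, pos, label))
    return lines

for line in build_crf_lines('none', 'flights from delhi to goa'):
    print(line)
# none NN o
# flights NNS i
# ... (exact POS tags depend on the tagger model)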