예제 #1
0
    def get_name_using_pos_tagger(self, text):
        """
        Detect a person name in ``text`` using templates and POS tags.

        The steps, in order:
            1. Blank text yields no result (the POS tagger is noted to raise
               IndexError when handed an empty token list).
            2. If any token is tagged WR*, WP* or CD* (wh-words / cardinals)
               the text is treated as a question or numeric input and
               nothing is returned.
            3. The first template that matches - "name [is] ...",
               "myself ...", "call me ...", "i am ..." - supplies the
               name words.
            4. Otherwise, when the text has fewer than 4 tokens and
               ``self.bot_message`` is truthy, the tokens tagged NN* or JJ*
               (nouns / adjectives) are used as the name words.

        Args:
            text (string): The text obtained from the user.

            Example text= My name is yash modi
        Returns:
            tuple: (entity_value, original_text) as produced by
            ``self.get_format_name``, or two empty lists when no name is found,
            e.g. [{first_name: "yash", middle_name: None, last_name: "modi"}], ["yash modi"]
        """
        entity_value, original_text = [], []

        name_tokens = text.split()
        if not name_tokens:
            # Nothing to tag; bail out before the tagger sees an empty list.
            return entity_value, original_text

        tagged_names = POS().tag(name_tokens)

        # Questions and inputs carrying numbers are not treated as names.
        if any(word[1].startswith(('WR', 'WP', 'CD')) for word in tagged_names):
            return entity_value, original_text

        # Templates are tried in priority order; the first hit wins.
        name_templates = (
            r"name\s+(?:is\s+)?([\w\s]+)",
            r"myself\s+([\w\s]+)",
            r"call\s+me\s+([\w\s]+)",
            r"i\s+am\s+([\w\s]+)",
        )
        for template in name_templates:
            template_match = re.findall(template, text)
            if template_match:
                entity_value, original_text = self.get_format_name(
                    template_match[0].split(), self.text)
                return entity_value, original_text

        # No template matched: fall back to POS tags for short replies only.
        if len(name_tokens) < 4 and self.bot_message:
            pos_words = [
                word[0] for word in tagged_names
                if word[1].startswith(('NN', 'JJ'))
            ]
            if pos_words:
                entity_value, original_text = self.get_format_name(
                    pos_words, self.text)

        return entity_value, original_text
예제 #2
0
    def get_name_using_pos_tagger(self, text):
        """
        Detect a person name in ``text`` from its POS tags.

        The steps, in order:
            1. Blank text yields no result (the POS tagger is noted to raise
               IndexError when handed an empty token list).
            2. The text is tagged with ``spacy_utils.tag`` when
               ``self.language`` is in EUROPEAN_LANGUAGES_SET, otherwise with
               ``POS().tag`` on the whitespace-split tokens.
            3. If any token is tagged WR*, WP* or CD* (wh-words / cardinals)
               nothing is returned.
            4. When fewer than 4 tokens were tagged and ``self.bot_message``
               is truthy, tokens tagged as noun / adjective (NOUN*, ADJ*,
               PROPN* for European languages; NN*, JJ* otherwise) are passed
               to ``self.get_format_name``.

        Args:
            text (string): The text obtained from the user.

            Example text= My name is yash modi
        Returns:
            tuple: (entity_value, original_text) as produced by
            ``self.get_format_name``, or two empty lists when no name is found,
            e.g. [{first_name: "yash", middle_name: None, last_name: "modi"}], ["yash modi"]
        """
        entity_value, original_text = [], []

        stripped_text = text.strip()
        if not stripped_text:
            # Nothing to tag; bail out before a tagger sees empty input.
            return entity_value, original_text

        is_european = self.language in EUROPEAN_LANGUAGES_SET
        if is_european:
            tagged_names = spacy_utils.tag(text=stripped_text,
                                           language=self.language)
        else:
            tagged_names = POS().tag(text.split())

        # Questions and inputs carrying numbers are not treated as names.
        # NOTE(review): these prefixes look like Penn Treebank tags; confirm
        # that spacy_utils.tag can actually emit them for European languages.
        if any(word[1].startswith(('WR', 'WP', 'CD')) for word in tagged_names):
            return entity_value, original_text

        if len(tagged_names) < 4 and self.bot_message:
            if is_european:
                name_tag_prefixes = ('NOUN', 'ADJ', 'PROPN')
            else:
                name_tag_prefixes = ('NN', 'JJ')

            pos_words = [
                word[0] for word in tagged_names
                if word[1].startswith(name_tag_prefixes)
            ]
            if pos_words:
                entity_value, original_text = self.get_format_name(
                    pos_words, self.text)

        return entity_value, original_text
    def get_pos_tagged_dict(docs):
        """
        Attach a POS tag sequence for every tokenized sentence in ``docs``.

        Each sentence found under ``docs[SENTENCE_LIST]`` is run through the
        POS tagger and only the tag (second item of every tagged pair) is
        kept. The resulting list of tag lists is stored under
        ``docs[CRF_POS_TAGS]``, replacing any previous value.

        Args:
            docs (dict): Holds the tokenized sentences under SENTENCE_LIST
                (the example below suggests the keys 'text_list' and
                'pos_tags' - confirm against the constants' definitions).
        Returns:
            docs (dict): The same dict object, mutated in place.
        Example:
            For city entity
            docs = {
            'labels': [['O', 'O', 'O', 'O', 'B', 'O', 'B'], ['O', 'O', 'O', 'O', 'B']],

            'text_list': [['book', 'a', 'flight', 'from', 'Mumbai', 'to', 'Delhi'],
                        ['Book', 'a', 'flight', 'to', 'Pune']]}

            get_pos_tagged_dict(docs)

            >> {

             'labels': [['O', 'O', 'O', 'O', 'B', 'O', 'B'], ['O', 'O', 'O', 'O', 'B']],

             'pos_tags': [['NN', 'DT', 'NN', 'IN', 'NNP', 'TO', 'VB'],
                         ['VB', 'DT', 'NN', 'TO', 'VB']],

             'text_list': [['book', 'a', 'flight', 'from', 'Mumbai', 'to', 'Delhi'],
                          ['Book', 'a', 'flight', 'to', 'Pune']]

                }
        """
        # Register the (initially empty) result list first, then fill it.
        tag_sequences = docs[CRF_POS_TAGS] = []
        sentence_tagger = POS()
        tag_sequences.extend(
            [tagged_pair[1]
             for tagged_pair in sentence_tagger.tagger.tag(sentence_tokens)]
            for sentence_tokens in docs[SENTENCE_LIST]
        )
        return docs
예제 #4
0
 def __init__(self):
     """Start with no tagger and no model path, and create the POS tagger."""
     # Tagger and model path both begin as None.
     self.tagger = self._model_path = None
     self.pos_tagger = POS()
예제 #5
0
class PredictCRF(object):
    """
    Tags city / date entities in a bot-message + user-message pair with CRF++.

    The CRF++ model objects are kept in the module-level globals
    CITY_MODEL_OBJECT / DATE_MODEL_OBJECT, created on first use and reused
    afterwards; ``self.tagger`` points at whichever one is selected.
    """

    def __init__(self):
        # Both are set by initialize_files() once an entity type is chosen.
        self.tagger = None
        self._model_path = None
        self.pos_tagger = POS()

    def get_model_output(self, entity_type, bot_message, user_message):
        """
        Run the whole pipeline and return the list of tagged data.

        When MODEL_RUN is truthy and ``entity_type`` is a supported type, this
        calls initialize_files(), add_data_to_tagger() and run_crf(), then
        converts the CRF output with generate_city_output() or
        generate_date_output().

        Args:
            entity_type: type of entity (CITY_ENTITY_TYPE or DATE_ENTITY_TYPE)
            bot_message: message from bot, may be None
            user_message: message from user

        Returns:
            list: output of the matching generate_*_output() function; an
            empty list when the model is not running or ``entity_type`` has
            no CRF model.
        """
        output_list = []
        if not MODEL_RUN:
            ner_logger.debug('MODEL IS NOT RUNNING: CRFPP not installed')
            return output_list

        if entity_type == CITY_ENTITY_TYPE:
            generate_output = generate_city_output
        elif entity_type == DATE_ENTITY_TYPE:
            generate_output = generate_date_output
        else:
            # No model exists for this type. Running the pipeline would either
            # fail on a missing tagger or parse with a stale one and then
            # discard the result, so stop here.
            ner_logger.debug('NO CRF MODEL FOR ENTITY TYPE: %s', entity_type)
            return output_list

        self.initialize_files(entity_type=entity_type)
        self.add_data_to_tagger(bot_message, user_message)
        crf_output = self.run_crf()
        output_list = generate_output(crf_data=crf_output)
        ner_logger.debug('NER MODEL OUTPUT: %s', output_list)
        return output_list

    def initialize_files(self, entity_type):
        """
        Select the model path and tagger that belong to ``entity_type``.

        For CITY_ENTITY_TYPE and DATE_ENTITY_TYPE the corresponding
        module-level model object is created on first use and then assigned to
        ``self.tagger``. Any other entity type leaves ``self.tagger`` and
        ``self._model_path`` unchanged.

        Args:
            entity_type: type of entity

        """
        global CITY_MODEL_OBJECT, DATE_MODEL_OBJECT
        if entity_type == CITY_ENTITY_TYPE:
            self._model_path = CITY_MODEL_PATH
            if not CITY_MODEL_OBJECT:
                CITY_MODEL_OBJECT = CRFPP.Tagger("-m %s -v 3 -n2" %
                                                 self._model_path)
                ner_logger.debug('CITY CRF model loaded %s', self._model_path)

            self.tagger = CITY_MODEL_OBJECT
        elif entity_type == DATE_ENTITY_TYPE:
            self._model_path = DATE_MODEL_PATH
            if not DATE_MODEL_OBJECT:
                DATE_MODEL_OBJECT = CRFPP.Tagger("-m %s -v 3 -n2" %
                                                 self._model_path)
                ner_logger.debug('date CRF model loaded %s', self._model_path)

            self.tagger = DATE_MODEL_OBJECT

    def add_data_to_tagger(self, bot_message, user_message):
        """
        Convert both messages to "<token> <pos_tag> <direction>" lines and add
        them to the tagger, bot message first.

        Args:
            bot_message: message from bot; None is treated as an empty string
            user_message: message from user

        for Example:
            Args:
                bot_message = 'none'
                user_message = 'flights from delhi to goa'

            Then this function tokenizes the bot and user messages, gets the POS tags, marks every token as
            outbound (bot) or inbound (user) as per the sender and adds it to the tagger object.

            tokens_bot_message = ['none']
            tokens_user_message = ['flights', 'from', 'delhi', 'to', 'goa']
            pos_bot_message = [['none', 'NN']]
            pos_user_message = [['flights','NNS'], ['from', 'VBP'], ['delhi', 'NN'], ['to', 'TO'], ['goa', 'VB']]

            none NN o
            flights NNS i
            from VBP i
            delhi NN i
            to TO i
            goa VB i
        """
        if bot_message is None:
            bot_message = ''

        tokens_bot_message = nltk_tokenizer.tokenize(bot_message)
        tokens_user_message = nltk_tokenizer.tokenize(user_message)

        pos_bot_message = self.pos_tagger.tag(tokens_bot_message)
        pos_user_message = self.pos_tagger.tag(tokens_user_message)
        for token in pos_bot_message:
            self.tagger.add(
                str(token[0]) + ' ' + str(token[1]) + ' ' + OUTBOUND)

        for token in pos_user_message:
            self.tagger.add(
                str(token[0]) + ' ' + str(token[1]) + ' ' + INBOUND)

    def run_crf(self):
        """
        Parse the data previously added to the tagger and return the labels.

        Returns:
            list: one [word, predicted_label] pair per token, in the order the
            tokens were added.

        The tagger is always cleared afterwards, even when parsing fails:
        the model object is shared at module level, so tokens left behind
        would otherwise leak into the next call.
        """
        try:
            self.tagger.parse()
            return [
                [self.tagger.x(index, 0), self.tagger.y2(index)]
                for index in range(self.tagger.size())
            ]
        finally:
            self.tagger.clear()