Example #1
    def __init__(self, entity_name=None):
        """
        Initializes a TextDetector object with the given entity_name

        Args:
            entity_name: The string with which detected text entity substrings will be replaced
                         when detect_entity() is called
        """
        self.text = None
        self.regx_to_process = Regex([(r'[\'\/]', r'')])
        self.text_dict = {}
        self.tagged_text = None
        self.text_entity = []
        self.original_text_entity = []
        self.processed_text = None
        self.entity_name = entity_name
        self.tag = '__' + self.entity_name + '__'

        # defaults for auto mode
        self._fuzziness = "auto:4,7"
        self._fuzziness_lo, self._fuzziness_hi = 4, 7
        self._min_token_size_for_fuzziness = self._fuzziness_lo
        # self.set_fuzziness_threshold(fuzziness=(self._fuzziness_lo, self._fuzziness_hi))

        # defaults for non-auto mode
        self.set_fuzziness_threshold(fuzziness=1)
        self._min_token_size_for_fuzziness = 4

        self.db = DataStore()
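The two fuzziness modes configured above can be sketched in use. This is illustrative only: the tuple form for auto mode mirrors the commented-out call in __init__, and the argument shape of detect_entity() is an assumption based on the docstring.

detector = TextDetector(entity_name='restaurant')

# non-auto mode (the default set in __init__): a fixed fuzziness of 1,
# applied only to tokens of at least 4 characters
detector.set_fuzziness_threshold(fuzziness=1)

# auto mode: low/high bounds, mirroring the commented-out call above
# (assumed to correspond to the "auto:4,7" string form)
detector.set_fuzziness_threshold(fuzziness=(4, 7))

# detect_entity() is expected to tag matched substrings with
# '__restaurant__' (call signature assumed, not shown in this snippet)
detector.detect_entity('i want to order from delhi dhaba')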
Example #2
    def __init__(self, entity_name, timezone=pytz.timezone('UTC')):
        """
        Initializes the DateAdvanceDetector object with the given entity_name and pytz timezone object

        Args:
            entity_name: The string with which detected date entity substrings will be replaced
                         when detect_entity() is called
            timezone: Optional pytz.timezone object used to get the current time; defaults to pytz.timezone('UTC')
        """
        self.text = ''
        self.tagged_text = ''
        self.processed_text = ''
        self.date = []
        self.original_date_text = []
        self.regx_to_process = Regex([(r'[\/]', r'')])
        self.regx_to_process_text = Regex([(r'[\,]', r'')])
        self.entity_name = entity_name
        self.tag = '__' + entity_name + '__'
        self.date_detector_object = DateDetector(entity_name=self.entity_name,
                                                 timezone=timezone)
        self.bot_message = None
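For instance, a detector localized to IST could be constructed as below; a minimal sketch, where 'date' is just an illustrative entity name.

import pytz

detector = DateAdvanceDetector(entity_name='date',
                               timezone=pytz.timezone('Asia/Kolkata'))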
Example #3
    def __init__(self, entity_name=None):
        """
        Initializes a TextDetector object with the given entity_name

        Args:
            entity_name: The string with which detected text entity substrings will be replaced
                         when detect_entity() is called
        """
        self.text = None
        self.regx_to_process = Regex([(r'[\'\/]', r'')])
        self.text_dict = {}
        self.fuzziness_threshold = 1
        self.min_size_token_for_levenshtein = 4
        self.tagged_text = None
        self.text_entity = []
        self.original_text_entity = []
        self.processed_text = None
        self.entity_name = entity_name
        self.tag = '__' + self.entity_name + '__'
Example #4
    def __init__(self, entity_name):
        """Initializes a BudgetDetector object

        Args:
            entity_name: The string with which the detected budget will be replaced when detect_entity() is called
        """

        self.min_digit = 2
        self.max_digit = 5
        self.entity_name = entity_name

        self.text = ''
        self.tagged_text = ''
        self.processed_text = ''
        self.budget = []
        self.original_budget_text = []

        regex_for_thousand = [(r'(\d+)k', r'\g<1>000')]
        self.regex_object = Regex(regex_for_thousand)
        self.tag = '__' + self.entity_name + '__'
        self.text_detection_object = TextDetector(entity_name=ES_BUDGET_LIST)
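The regex_for_thousand substitution expands shorthand amounts such as '5k' into '5000' before the digit limits are applied. A minimal sketch, assuming Regex.text_substitute applies the (pattern, replacement) list the same way it does in Example #6 below:

regex_object = Regex([(r'(\d+)k', r'\g<1>000')])
print(regex_object.text_substitute('my budget is 5k'))  # -> 'my budget is 5000'
# with min_digit=2 and max_digit=5, amounts from 10 to 99999 would qualify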
Example #5
    def __init__(self,
                 entity_name=None,
                 source_language_script=ENGLISH_LANG,
                 translation_enabled=False):
        """
        Initializes a TextDetector object with the given entity_name

        Args:
            entity_name: The string with which detected text entity substrings will be replaced
                         when detect_entity() is called
            source_language_script: ISO 639 code for the language of the entities to be detected by this instance
            translation_enabled: True if messages need to be translated when the detector does not support a
                                 particular language, else False
        """
        # assigning values to superclass attributes
        self._supported_languages = [ENGLISH_LANG, HINDI_LANG]
        super(TextDetector, self).__init__(source_language_script,
                                           translation_enabled)

        self.text = None
        self.regx_to_process = Regex([(r'[\'\/]', r'')])
        self.text_dict = {}
        self.tagged_text = None
        self.text_entity = []
        self.original_text_entity = []
        self.processed_text = None
        self.entity_name = entity_name
        self.tag = '__' + self.entity_name + '__'

        # defaults for auto mode
        self._fuzziness = "auto:4,7"
        self._fuzziness_lo, self._fuzziness_hi = 4, 7
        self._min_token_size_for_fuzziness = self._fuzziness_lo
        # self.set_fuzziness_threshold(fuzziness=(self._fuzziness_lo, self._fuzziness_hi))

        # defaults for non-auto mode
        self.set_fuzziness_threshold(fuzziness=1)
        self._min_token_size_for_fuzziness = 4

        self.db = DataStore()
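A construction sketch for this language-aware variant; the argument values are illustrative, and HINDI_LANG is one of the scripts listed in _supported_languages above.

detector = TextDetector(entity_name='restaurant',
                        source_language_script=HINDI_LANG,
                        translation_enabled=True)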
Example #6
from collections import defaultdict

from lib.nlp.regex import Regex

# ORIGINAL_TEXT, DETECTION_METHOD, FROM_MESSAGE, FROM_MODEL_VERIFIED,
# FROM_MODEL_NOT_VERIFIED and sort_original_text come from elsewhere in the
# project; their exact modules are not shown in this snippet.


def combine_output_of_detection_logic_and_tag(entity_data, text):
    """NER is often used to tag the chat so it can be used in disambiguation process. Also, many times one entity may
    overlap with another.
    For example: "I want to order from Delhi Dhaba" and we want to detect two entities i.e. restaurant and city.
    So, first we will run individual detection logic of restaurant and city and from this we are able to derive two
    entity values i.e. Delhi Dhaba (restaurant) and Delhi (city) but we see that entity city is irrelevant in above
    case because message is about ordering from restaurant. So it necessary to process the output which is obtained by
    running individual detection logic and keep the relevant entities.

    Args:
        text: a message on which the detection logic needs to run. For example "i want to order from delhi dhaba"
        entity_data: dictionary mapping each entity_name to the output of that entity's detection logic.
        For example:
            {
            "restaurant":
                [
                    {
                        "detection": "chat",
                        "original_text": "delhi dhaba",
                        "entity_value":"Delhi Dhaba"
                    }
                ],
            "city":
                [
                    {
                        "detection": "chat",
                        "original_text": "delhi",
                        "entity_value":"New Delhi"
                    }
                ]
        }

    Returns:
        A dictionary of the form
        {
            'entity_data': PROCESSED_ENTITY_DICTIONARY,
            'tag': TAGGED_TEXT
        }

        entity_data is the processed dictionary of entities, containing only valid entity values with the
        ambiguity removed; tag is the tagged text.
        For example:
          {
            "entity_data":
                {
                    "restaurant":
                        [
                            {
                                "detection": "chat",
                                "original_text": "delhi dhaba",
                                "entity_value":"Delhi Dhaba"
                            }
                        ],
                    "city":
                        [
                            {
                                "detection": "chat",
                                "original_text": "delhi",
                                "entity_value":"New Delhi"
                            }
                        ]
                },
            "tagged_text": "i want to order from __restaurant__"
          }

    """
    regex = Regex([(r'[\'\/]', r'')])
    text = regex.text_substitute(text)
    final_entity_data = defaultdict(list)
    tagged_text = text.lower()
    processed_text = text.lower()
    tag_preprocess_dict = defaultdict(list)
    for entity, entity_list in entity_data.items():
        if entity_list:
            for entity_identified in entity_list:
                if entity_identified[ORIGINAL_TEXT] and \
                                entity_identified[DETECTION_METHOD] in [FROM_MESSAGE, FROM_MODEL_VERIFIED,
                                                                        FROM_MODEL_NOT_VERIFIED]:
                    tag_preprocess_dict[
                        entity_identified[ORIGINAL_TEXT].lower()].append(
                            [entity_identified, entity])
                else:
                    tag_preprocess_dict['NA'].append(
                        [entity_identified, entity])
        else:
            final_entity_data[entity] = None

    original_text_list = tag_preprocess_dict.keys()
    original_text_list = sort_original_text(original_text_list)
    for original_text in original_text_list:
        tag = ''
        if original_text in processed_text:
            processed_text = processed_text.replace(original_text, '')
            for entity_dict, entity in tag_preprocess_dict[original_text]:
                tag += '_' + entity
                if final_entity_data[entity]:
                    final_entity_data[entity].append(entity_dict)
                else:
                    final_entity_data[entity] = [entity_dict]
            if tag != '':
                tag = '_' + tag + '__'
            tagged_text = tagged_text.replace(original_text, tag)
        else:
            for entity_dict, entity in tag_preprocess_dict[original_text]:
                if not final_entity_data[entity]:
                    final_entity_data[entity] = None

    if tag_preprocess_dict.get('NA'):
        for entity_dict, entity in tag_preprocess_dict['NA']:
            if final_entity_data[entity]:
                final_entity_data[entity].append(entity_dict)
            else:
                final_entity_data[entity] = [entity_dict]

    return {'entity_data': final_entity_data, 'tag': tagged_text}
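A usage sketch with the docstring's example input. It assumes ORIGINAL_TEXT and DETECTION_METHOD resolve to the 'original_text' and 'detection' keys, and that 'chat' is an accepted detection method, as the docstring examples imply.

entity_data = {
    'restaurant': [{'detection': 'chat',
                    'original_text': 'delhi dhaba',
                    'entity_value': 'Delhi Dhaba'}],
    'city': [{'detection': 'chat',
              'original_text': 'delhi',
              'entity_value': 'New Delhi'}],
}
result = combine_output_of_detection_logic_and_tag(
    entity_data, 'i want to order from delhi dhaba')
# result['tag']         -> 'i want to order from __restaurant__'
# result['entity_data'] -> restaurant is kept; city resolves to None because
#                          'delhi' was consumed by the longer 'delhi dhaba' match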
Example #7
import os
from lib.nlp.etc import store_data_in_list
from lib.nlp.lemmatizer import Lemmatizer, WORDNET_LEMMATIZER
from lib.nlp.ngram import Ngram
from lib.nlp.stemmer import Stemmer, PORTER_STEMMER
from lib.nlp.tokenizer import Tokenizer, PRELOADED_NLTK_TOKENIZER
from lib.nlp.regex import Regex
from chatbot_ner.settings import BASE_DIR

stemmer = Stemmer(PORTER_STEMMER)
lemmatizer = Lemmatizer(WORDNET_LEMMATIZER)
tokenizer = Tokenizer(PRELOADED_NLTK_TOKENIZER)

# Creating list of stop words
stop_word_path = os.path.join(
    BASE_DIR, 'lib', 'nlp', 'data',
    'stop_words.csv')  # file containing words to remove
stop_words = store_data_in_list(stop_word_path)

ngram_object = Ngram()

punctuation_removal_list = [(r'[^\w\'\/]', r' '), (r'\'', r'')]
regx_punctuation_removal = Regex(punctuation_removal_list)
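A quick sketch of the two-pass cleanup defined above, again assuming Regex.text_substitute applies the pattern list in order:

# pass 1 turns everything except word characters, apostrophes and slashes
# into spaces; pass 2 strips the remaining apostrophes
print(regx_punctuation_removal.text_substitute("i'd like thai, or chinese!"))
# expected -> 'id like thai  or chinese ' under those assumptions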