def __init__(self, entity_name=None):
    """
    Initializes a TextDetector object with given entity_name

    Args:
        entity_name: A string by which the detected substrings that correspond to text entities
                     would be replaced with on calling detect_entity()
    """
    self.text = None
    # Strips single quotes and forward slashes from the incoming text before matching
    self.regx_to_process = Regex([(r'[\'\/]', r'')])
    self.text_dict = {}
    self.tagged_text = None
    self.text_entity = []
    self.original_text_entity = []
    self.processed_text = None
    self.entity_name = entity_name
    # NOTE(review): entity_name defaults to None but is concatenated below — passing no
    # entity_name raises TypeError here; confirm all callers pass a string
    self.tag = '__' + self.entity_name + '__'

    # defaults for auto mode: fuzziness spec "auto:<lo>,<hi>" with lo/hi edit distances
    self._fuzziness = "auto:4,7"
    self._fuzziness_lo, self._fuzziness_hi = 4, 7
    self._min_token_size_for_fuzziness = self._fuzziness_lo

    # defaults for non-auto mode (these override the auto-mode values set above)
    self.set_fuzziness_threshold(fuzziness=1)
    self._min_token_size_for_fuzziness = 4
    self.db = DataStore()
def __init__(self, entity_name, timezone=pytz.timezone('UTC')):
    """
    Initializes the DateAdvanceDetector object with given entity_name and pytz timezone object

    Args:
        entity_name: A string by which the detected date entity substrings would be replaced
                     with on calling detect_entity()
        timezone: Optional, pytz.timezone object used for getting current time,
                  default is pytz.timezone('UTC')
    """
    # Text bookkeeping for the current detection pass
    self.text = ''
    self.tagged_text = ''
    self.processed_text = ''

    # Accumulators for detected date values and the substrings they were detected from
    self.date = []
    self.original_date_text = []

    # Regex helpers: one drops forward slashes, the other drops commas
    self.regx_to_process = Regex([(r'[\/]', r'')])
    self.regx_to_process_text = Regex([(r'[\,]', r'')])

    self.entity_name = entity_name
    self.tag = '__' + entity_name + '__'

    # Delegate actual date parsing to a DateDetector bound to the same entity and timezone
    self.date_detector_object = DateDetector(entity_name=self.entity_name, timezone=timezone)
    self.bot_message = None
def __init__(self, entity_name=None):
    """
    Initializes a TextDetector object with given entity_name

    Args:
        entity_name: A string by which the detected substrings that correspond to text entities
                     would be replaced with on calling detect_entity()
    """
    # Raw and intermediate text state
    self.text = None
    self.tagged_text = None
    self.processed_text = None

    # Strips single quotes and forward slashes from the incoming text before matching
    self.regx_to_process = Regex([(r'[\'\/]', r'')])

    # Fuzzy-matching knobs: maximum edit distance and the minimum token length
    # at which Levenshtein matching is applied
    self.fuzziness_threshold = 1
    self.min_size_token_for_levenshtein = 4

    # Detection results and lookup structures
    self.text_dict = {}
    self.text_entity = []
    self.original_text_entity = []

    self.entity_name = entity_name
    self.tag = '__' + self.entity_name + '__'
def __init__(self, entity_name):
    """Initializes a BudgetDetector object

    Args:
        entity_name: A string by which the detected budget would be replaced with on
                     calling detect_entity()
    """
    # Accepted digit-count range for a budget figure
    self.min_digit = 2
    self.max_digit = 5

    self.entity_name = entity_name
    self.tag = '__' + self.entity_name + '__'

    # Text bookkeeping for the current detection pass
    self.text = ''
    self.tagged_text = ''
    self.processed_text = ''

    # Detected budget values and the substrings they were detected from
    self.budget = []
    self.original_budget_text = []

    # Expands shorthand like "5k" into "5000" before digit matching
    self.regex_object = Regex([(r'(\d+)k', r'\g<1>000')])

    self.text_detection_object = TextDetector(entity_name=ES_BUDGET_LIST)
def __init__(self, entity_name=None, source_language_script=ENGLISH_LANG, translation_enabled=False):
    """
    Initializes a TextDetector object with given entity_name

    Args:
        entity_name: A string by which the detected substrings that correspond to text entities
                     would be replaced with on calling detect_entity()
        source_language_script: ISO 639 code for language of entities to be detected by the
                                instance of this class
        translation_enabled: True if messages needs to be translated in case detector does not
                             support a particular language, else False
    """
    # assigning values to superclass attributes
    self._supported_languages = [ENGLISH_LANG, HINDI_LANG]
    super(TextDetector, self).__init__(source_language_script, translation_enabled)

    self.text = None
    # Strips single quotes and forward slashes from the incoming text before matching
    self.regx_to_process = Regex([(r'[\'\/]', r'')])
    self.text_dict = {}
    self.tagged_text = None
    self.text_entity = []
    self.original_text_entity = []
    self.processed_text = None
    self.entity_name = entity_name
    # NOTE(review): entity_name defaults to None but is concatenated below — passing no
    # entity_name raises TypeError here; confirm all callers pass a string
    self.tag = '__' + self.entity_name + '__'

    # defaults for auto mode: fuzziness spec "auto:<lo>,<hi>" with lo/hi edit distances
    self._fuzziness = "auto:4,7"
    self._fuzziness_lo, self._fuzziness_hi = 4, 7
    self._min_token_size_for_fuzziness = self._fuzziness_lo

    # defaults for non-auto mode (these override the auto-mode values set above)
    self.set_fuzziness_threshold(fuzziness=1)
    self._min_token_size_for_fuzziness = 4
    self.db = DataStore()
def combine_output_of_detection_logic_and_tag(entity_data, text):
    """NER is often used to tag the chat so it can be used in disambiguation process.

    Also, many times one entity may overlap with another. For example:
    "I want to order from Delhi Dhaba" and we want to detect two entities i.e. restaurant and
    city. So, first we will run individual detection logic of restaurant and city and from this
    we are able to derive two entity values i.e. Delhi Dhaba (restaurant) and Delhi (city) but
    we see that entity city is irrelevant in above case because message is about ordering from
    restaurant. So it necessary to process the output which is obtained by running individual
    detection logic and keep the relevant entities.

    Attributes:
        text: a message on which detection logic needs to run. For example
              "i want to order form delhi dhaba"
        entity_data: dictionary containing key as entity_name and value as a output from
            entities detection logic. For example:
                {
                    "restaurant": [{"detection": "chat", "original_text": "delhi dhaba",
                                    "entity_value": "Delhi Dhaba"}],
                    "city": [{"detection": "chat", "original_text": "delhi",
                              "entity_value": "New Delhi"}]
                }

    Output:
        will be list of dictionary
            {'entity_data': PROCESSED_ENTITY_DICTIONARY, 'tag': TAGGED_TEXT}
        entity_data will be processed dictionary of entities containing valid entity values
        with the ambiguity removed, and tagged_text will be the tagged data. For example:
            {
                "entity_data": {
                    "restaurant": [{"detection": "chat", "original_text": "delhi dhaba",
                                    "entity_value": "Delhi Dhaba"}],
                    "city": [{"detection": "chat", "original_text": "delhi",
                              "entity_value": "New Delhi"}]
                },
                "tagged_text": "i want to order from __restaurant__"
            }
    """
    # Normalize the message the same way the individual detectors do:
    # strip single quotes and forward slashes, then lowercase.
    regex = Regex([(r'[\'\/]', r'')])
    text = regex.text_substitute(text)
    final_entity_data = defaultdict(list)
    tagged_text = text.lower()
    processed_text = text.lower()
    # Maps lowercased original_text -> list of [entity_detail_dict, entity_name] pairs.
    # Detections with no original_text, or not detected directly from the message,
    # are pooled under the sentinel key 'NA' and kept unconditionally at the end.
    tag_preprocess_dict = defaultdict(list)
    for entity, entity_list in entity_data.iteritems():  # Python 2 dict API
        if entity_list:
            for entity_identified in entity_list:
                if entity_identified[ORIGINAL_TEXT] and \
                        entity_identified[DETECTION_METHOD] in [FROM_MESSAGE, FROM_MODEL_VERIFIED,
                                                                FROM_MODEL_NOT_VERIFIED]:
                    tag_preprocess_dict[
                        entity_identified[ORIGINAL_TEXT].lower()].append(
                        [entity_identified, entity])
                else:
                    tag_preprocess_dict['NA'].append(
                        [entity_identified, entity])
        else:
            # Detection ran but found nothing for this entity; record that explicitly
            final_entity_data[entity] = None
    # Resolve overlaps by processing substrings in the order given by sort_original_text
    # (project helper — presumably longest-first so "delhi dhaba" consumes "delhi"; confirm)
    original_text_list = tag_preprocess_dict.keys()
    original_text_list = sort_original_text(original_text_list)
    for original_text in original_text_list:
        tag = ''
        if original_text in processed_text:
            # Consume this span so any shorter overlapping match later in the list
            # will no longer be found in processed_text and gets dropped as irrelevant
            processed_text = processed_text.replace(original_text, '')
            for entity_dict, entity in tag_preprocess_dict[original_text]:
                tag += '_' + entity
                if final_entity_data[entity]:
                    final_entity_data[entity].append(entity_dict)
                else:
                    final_entity_data[entity] = [entity_dict]
            if tag != '':
                # tag already begins with '_', so this produces '__entity[_entity...]__'
                tagged_text = tagged_text.replace(original_text, '_' + tag + '__') if False else tagged_text
                tag = '_' + tag + '__'
                tagged_text = tagged_text.replace(original_text, tag)
        else:
            # Span was already consumed by an earlier (longer) match; mark its entities
            # as empty unless they collected values from some other span
            for entity_dict, entity in tag_preprocess_dict[original_text]:
                if not final_entity_data[entity]:
                    final_entity_data[entity] = None
    # Detections not tied to a message substring are always kept
    if tag_preprocess_dict.get('NA'):
        for entity_dict, entity in tag_preprocess_dict['NA']:
            if final_entity_data[entity]:
                final_entity_data[entity].append(entity_dict)
            else:
                final_entity_data[entity] = [entity_dict]
    return {'entity_data': final_entity_data, 'tag': tagged_text}
import os

from lib.nlp.etc import store_data_in_list
from lib.nlp.lemmatizer import Lemmatizer, WORDNET_LEMMATIZER
from lib.nlp.ngram import Ngram
from lib.nlp.stemmer import Stemmer, PORTER_STEMMER
from lib.nlp.tokenizer import Tokenizer, PRELOADED_NLTK_TOKENIZER
from lib.nlp.regex import Regex

from chatbot_ner.settings import BASE_DIR

# Shared module-level NLP helpers, constructed once at import time
stemmer = Stemmer(PORTER_STEMMER)
lemmatizer = Lemmatizer(WORDNET_LEMMATIZER)
tokenizer = Tokenizer(PRELOADED_NLTK_TOKENIZER)

# Creating list of stop words
stop_word_path = os.path.join(
    BASE_DIR, 'lib', 'nlp', 'data', 'stop_words.csv')  # file containing words to remove
stop_words = store_data_in_list(stop_word_path)

ngram_object = Ngram()

# Replace any character that is not word-char/quote/slash with a space,
# then drop remaining single quotes
punctuation_removal_list = [(r'[^\w\'\/]', r' '), (r'\'', r'')]
regx_punctuation_removal = Regex(punctuation_removal_list)