def city(request):
    """Detect city entities by calling get_city(). Invoked through an API call.

    Args:
        request (django.http.HttpRequest): request carrying url parameters

    Returns:
        django.http.HttpResponse: JSON body {'data': <detected entities>} on
        success, or an empty 500 response on known failures.
    """
    try:
        parameters_dict = parse_parameters_from_request(request)
        entity_output = get_city(parameters_dict[PARAMETER_MESSAGE],
                                 parameters_dict[PARAMETER_ENTITY_NAME],
                                 parameters_dict[PARAMETER_STRUCTURED_VALUE],
                                 parameters_dict[PARAMETER_FALLBACK_VALUE],
                                 parameters_dict[PARAMETER_BOT_MESSAGE],
                                 parameters_dict[PARAMETER_SOURCE_LANGUAGE]
                                 )
        ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output))
    except TypeError as e:
        ner_logger.exception('Exception for city: %s ' % e)
        return HttpResponse(status=500)
    except KeyError as e:
        # BUG FIX: the messages below previously said 'text_synonym'
        # (copy-paste from another view), mislabelling errors raised from
        # the city endpoint in the logs.
        ner_logger.exception('Exception for city: %s ' % e)
        return HttpResponse(status=500)
    except es_exceptions.ConnectionTimeout as e:
        ner_logger.exception('Exception for city: %s ' % e)
        return HttpResponse(status=500)
    except es_exceptions.ConnectionError as e:
        ner_logger.exception('Exception for city: %s ' % e)
        return HttpResponse(status=500)

    return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
def regex(request):
    """Detect text abiding by the caller-supplied regex via get_regex().
    Invoked through an API call.

    Args:
        request (django.http.HttpRequest): request carrying url parameters

    Returns:
        django.http.HttpResponse: JSON body {'data': <detected entities>} on
        success, or an empty 500 response on a TypeError.
    """
    try:
        params = parse_parameters_from_request(request)
        entity_output = get_regex(params[PARAMETER_MESSAGE],
                                  params[PARAMETER_ENTITY_NAME],
                                  params[PARAMETER_STRUCTURED_VALUE],
                                  params[PARAMETER_FALLBACK_VALUE],
                                  params[PARAMETER_BOT_MESSAGE],
                                  params[PARAMETER_REGEX],
                                  params[PARAMETER_ASR],
                                  params[PARAMETER_SOURCE_LANGUAGE]
                                  )
        ner_logger.debug('Finished %s : %s ' % (params[PARAMETER_ENTITY_NAME], entity_output))
    except TypeError as e:
        ner_logger.exception('Exception for regex: %s ' % e)
        return HttpResponse(status=500)
    return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
def get_crf_data_for_entity_name(self, entity_name, **kwargs):
    """Obtain the training sentences and their entities for the given entity name.

    Args:
        entity_name (str): Entity name for which training data needs to be obtained
        kwargs:
            For Elasticsearch:
                Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search

    Returns:
        results_dictionary (dict): Dictionary consisting of the training data for the given entity.

    Raises:
        IndexNotFoundException: if es_training_index was not found in connection settings

    Example:
        db = Datastore()
        db.get_entity_training_data(entity_name, **kwargs):
        >> {
            'sentence_list': [
                'My name is hardik',
                'This is my friend Ajay'
            ],
            'entity_list': [
                ['hardik'],
                ['Ajay']
            ]
        }
    """
    ner_logger.debug('Datastore, get_entity_training_data, entity_name %s' % entity_name)
    if self._client_or_connection is None:
        self._connect()
    results_dictionary = {}
    if self._engine == ELASTICSEARCH:
        es_training_index = self._connection_settings.get(ELASTICSEARCH_CRF_DATA_INDEX_NAME)
        if es_training_index is None:
            raise IndexNotFoundException('Index for ELASTICSEARCH_CRF_DATA_INDEX_NAME not found. '
                                         'Please configure the same')
        self._check_doc_type_for_crf_data_elasticsearch()
        request_timeout = self._connection_settings.get('request_timeout', 20)
        results_dictionary = elastic_search.query.get_crf_data_for_entity_name(
            connection=self._client_or_connection,
            index_name=es_training_index,
            doc_type=self._connection_settings[ELASTICSEARCH_CRF_DATA_DOC_TYPE],
            entity_name=entity_name,
            request_timeout=request_timeout,
            **kwargs)
    # BUG FIX: the closing debug line claimed to log results_dictionary but
    # formatted entity_name instead; log the dictionary as the message says.
    ner_logger.debug('Datastore, get_entity_training_data, results_dictionary %s' % str(results_dictionary))
    return results_dictionary
def detect_regex(self): """ Detects text based on the aforementioned regex Raises an error for an invalid regex Returns: (regex_list, original_list) (tuple): regex_list (list) : list of detected text for the specified text original_list (list) : list of original text provided by the user Example: self.regex = r'\d+' self.text = 'aman123" detect_entity() >> (['123'], ['123']) """ original_list = [] regex_list = [] try: compiled_regex = re.compile(self.regex) regex_list.append(compiled_regex.findall(self.text)[0]) original_list.extend(regex_list) self.update_processed_text(regex_list) except Exception as e: ner_logger.debug("Exception detect regex: %s" % e.message) return regex_list, original_list
def get_parameters_dictionary(request):
    # type: (django.http.HttpRequest) -> Dict[str, Any]
    """Extract GET parameters from an HTTP request.

    Args:
        request (django.http.HttpRequest): HTTP request carrying url parameters

    Returns:
        dict: GET parameters from the request
    """
    query = request.GET
    parameters_dict = {
        PARAMETER_MESSAGE: query.get('message'),
        PARAMETER_ENTITY_NAME: query.get('entity_name'),
        PARAMETER_STRUCTURED_VALUE: query.get('structured_value'),
        PARAMETER_FALLBACK_VALUE: query.get('fallback_value'),
        PARAMETER_BOT_MESSAGE: query.get('bot_message'),
        PARAMETER_TIMEZONE: query.get('timezone'),
        PARAMETER_REGEX: query.get('regex'),
        PARAMETER_ASR: query.get('is_asr', False),
        PARAMETER_LANGUAGE_SCRIPT: query.get('language_script', ENGLISH_LANG),
        PARAMETER_SOURCE_LANGUAGE: query.get('source_language', ENGLISH_LANG),
        PARAMETER_FUZZINESS: query.get('fuzziness'),
        PARAMETER_MIN_TOKEN_LEN_FUZZINESS: query.get('min_token_len_fuzziness'),
        PARAMETER_MIN_DIGITS: query.get('min_number_digits'),
        PARAMETER_MAX_DIGITS: query.get('max_number_digits'),
        # predetected_values arrives JSON-encoded; default is an empty list
        PARAMETER_PRIOR_RESULTS: json.loads(query.get("predetected_values", '[]')),
    }
    ner_logger.debug("parameters dict - {}".format(parameters_dict))
    return parameters_dict
def train_crf_model_from_list(self, sentence_list, entity_list, c1=0, c2=0, max_iterations=1000):
    """Train the CRF model from raw sentence/entity lists.

    Runs the preprocessing step over the inputs and then trains the model
    on the resulting features.

    Args:
        sentence_list (list): Sentences on which the NER task has to be carried out.
        entity_list (list): Entities present in each sentence of sentence_list.
        c1 (int): Coefficient of regularization to control variance and bias.
        c2 (int): Coefficient of regularization to control variance and bias.
        max_iterations (int): Max number of iterations to be carried out.

    Returns:
        The path of the trained model.
    """
    ner_logger.debug('Pre processing for Entity: %s started' % self.entity_name)
    features, labels = CrfPreprocessData.preprocess_crf_text_entity_list(
        sentence_list=sentence_list,
        entity_list=entity_list,
        read_embeddings_from_remote_url=self.read_embeddings_from_remote_url
    )
    ner_logger.debug('Pre processing for Entity: %s completed' % self.entity_name)
    return self.train_crf_model(features, labels, c1, c2, max_iterations)
def budget(request):
    """Detect budget entities by calling get_budget(). Invoked through an API call.

    Args:
        request (django.http.HttpRequest): request carrying url parameters

    Returns:
        django.http.HttpResponse: JSON body {'data': <detected entities>} on
        success, or an empty 500 response on a TypeError.
    """
    try:
        params = get_parameters_dictionary(request)
        ner_logger.debug('Start: %s ' % params[PARAMETER_ENTITY_NAME])
        entity_output = get_budget(params[PARAMETER_MESSAGE],
                                   params[PARAMETER_ENTITY_NAME],
                                   params[PARAMETER_STRUCTURED_VALUE],
                                   params[PARAMETER_FALLBACK_VALUE],
                                   params[PARAMETER_BOT_MESSAGE],
                                   params[PARAMETER_MIN_DIGITS],
                                   params[PARAMETER_MAX_DIGITS])
        ner_logger.debug('Finished %s : %s ' % (params[PARAMETER_ENTITY_NAME], entity_output))
    except TypeError as e:
        ner_logger.exception('Exception for budget: %s ' % e)
        return HttpResponse(status=500)
    return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
def test_number_range_detection_from_csv(self):
    """Run every number-range detection case from the CSV, grouped by language."""
    test_df = pd.read_csv(self.csv_path, encoding='utf-8', keep_default_na=False)
    for language, language_tests_df in test_df.groupby(by=['language']):
        ner_logger.debug('Running tests for language {}'.format(language))
        for _, row in language_tests_df.iterrows():
            message = row['message']
            # 'NA' in the CSV means no unit-type restriction
            unit_type = row['unit_type'] if row['unit_type'] != 'NA' else None
            detector = NumberRangeDetector(entity_name='number_range',
                                           language=language,
                                           unit_type=unit_type)
            expected_values, expected_texts = self._make_expected_output(
                row['min_value'], row['max_value'], row['unit'], row['original_text'])
            expected_pairs = list(zip(expected_values, expected_texts))
            detected_values, detected_texts = detector.detect_entity(message)
            for detected_pair in zip(detected_values, detected_texts):
                self.assertIn(detected_pair, expected_pairs)
def get_timezone(timezone, ignore_errors=True):
    # type: (Union[datetime.tzinfo, str, unicode], bool) -> datetime.tzinfo
    """Return a datetime.tzinfo (pytz timezone object) for *timezone*.

    A non-string datetime.tzinfo that already carries a ``localize`` method
    (i.e. a pytz timezone) is returned unchanged. Otherwise a pytz timezone
    is constructed from the value. On failure, pytz UTC is returned when
    *ignore_errors* is True; the error propagates otherwise.

    Args:
        timezone (str or datetime.tzinfo): Either a valid timezone string or
            a datetime.tzinfo object
        ignore_errors (bool, optional): when True (default), ignore errors
            and return pytz UTC. When False, raise on invalid timezones.

    Returns:
        datetime.tzinfo: A pytz timezone object
    """
    is_pytz_tzinfo = (not isinstance(timezone, six.string_types)
                      and isinstance(timezone, tzinfo)
                      and hasattr(timezone, 'localize'))
    if is_pytz_tzinfo:
        return timezone
    try:
        return pytz.timezone(timezone)
    except Exception as e:
        if not ignore_errors:
            raise
        ner_logger.debug('Timezone error: %s ' % e)
        ner_logger.debug('Using "UTC" as default timezone')
        return pytz.timezone('UTC')
def person_name(request):
    """Detect person names by calling get_person_name(). Invoked through an API call.

    Args:
        request (django.http.HttpRequest): request carrying url parameters

    Returns:
        django.http.HttpResponse: JSON body {'data': <detected entities>} on
        success, or an empty 500 response on a TypeError.
    """
    try:
        params = get_parameters_dictionary(request)
        ner_logger.debug('Start: %s ' % params[PARAMETER_ENTITY_NAME])
        entity_output = get_person_name(
            message=params[PARAMETER_MESSAGE],
            entity_name=params[PARAMETER_ENTITY_NAME],
            structured_value=params[PARAMETER_STRUCTURED_VALUE],
            fallback_value=params[PARAMETER_FALLBACK_VALUE],
            bot_message=params[PARAMETER_BOT_MESSAGE],
            language=params[PARAMETER_SOURCE_LANGUAGE])
        ner_logger.debug('Finished %s : %s ' % (params[PARAMETER_ENTITY_NAME], entity_output))
    except TypeError as e:
        ner_logger.exception('Exception for person_name: %s ' % e)
        return HttpResponse(status=500)
    return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
def initialize_files(self, entity_type):
    """
    This function checks the type of entity. We have currently done it for entity_type='city'.
    If the input parameter is entity_type city, it will run CRF model loaded for city and initialize
    the tagger and model_path accordingly

    Args:
        entity_type: type of entity

    Side effects:
        Sets self._model_path and self.tagger; on first use per entity type,
        creates the CRFPP tagger and stores it in a module-level cache
        (CITY_MODEL_OBJECT / DATE_MODEL_OBJECT) so the model file is loaded
        only once per process.
    """
    # Module-level cache: taggers are created lazily and shared across instances
    global CITY_MODEL_OBJECT, DATE_MODEL_OBJECT
    if entity_type == CITY_ENTITY_TYPE:
        self._model_path = CITY_MODEL_PATH
        if not CITY_MODEL_OBJECT:
            # "-m <path> -v 3 -n2": model path, verbosity 3, top-2 n-best output
            CITY_MODEL_OBJECT = CRFPP.Tagger("-m %s -v 3 -n2" % self._model_path)
            ner_logger.debug('CITY CRF model loaded %s' % self._model_path)
        self.tagger = CITY_MODEL_OBJECT
    elif entity_type == DATE_ENTITY_TYPE:
        self._model_path = DATE_MODEL_PATH
        if not DATE_MODEL_OBJECT:
            DATE_MODEL_OBJECT = CRFPP.Tagger("-m %s -v 3 -n2" % self._model_path)
            ner_logger.debug('date CRF model loaded %s' % self._model_path)
        self.tagger = DATE_MODEL_OBJECT
def parse_post_request(request):
    # type: (django.http.HttpRequest) -> Dict[str, Any]
    """Extract the POST request body from an HTTP request.

    Args:
        request (django.http.HttpRequest): HTTP request with a JSON body

    Returns:
        dict: parameters from the request
    """
    body = json.loads(request.body)
    parameters_dict = {
        PARAMETER_MESSAGE: body.get('message'),
        PARAMETER_ENTITY_NAME: body.get('entity_name'),
        PARAMETER_STRUCTURED_VALUE: body.get('structured_value'),
        PARAMETER_FALLBACK_VALUE: body.get('fallback_value'),
        PARAMETER_BOT_MESSAGE: body.get('bot_message'),
        PARAMETER_TIMEZONE: body.get('timezone'),
        PARAMETER_REGEX: body.get('regex'),
        PARAMETER_LANGUAGE_SCRIPT: body.get('language_script', ENGLISH_LANG),
        PARAMETER_SOURCE_LANGUAGE: body.get('source_language', ENGLISH_LANG),
        PARAMETER_FUZZINESS: body.get('fuzziness'),
        PARAMETER_MIN_TOKEN_LEN_FUZZINESS: body.get('min_token_len_fuzziness'),
        PARAMETER_MIN_DIGITS: body.get('min_number_digits'),
        PARAMETER_MAX_DIGITS: body.get('max_number_digits'),
        # flag-like fields are normalized through to_bool
        PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(body.get('read_embeddings_from_remote_url')),
        PARAMETER_READ_MODEL_FROM_S3: to_bool(body.get('read_model_from_s3')),
        PARAMETER_LIVE_CRF_MODEL_PATH: body.get('live_crf_model_path'),
        PARAMETER_PRIOR_RESULTS: body.get("predetected_values", []),
    }
    ner_logger.debug("parameters dict - {}".format(parameters_dict))
    return parameters_dict
def passenger_count(request):
    """Detect passenger counts by calling get_passenger_count(). Invoked through an API call.

    Args:
        request (django.http.request.HttpRequest): HttpRequest object

    Returns:
        response (django.http.response.HttpResponse): HttpResponse object
    """
    try:
        params = get_parameters_dictionary(request)
        ner_logger.debug('Start: %s ' % params[PARAMETER_ENTITY_NAME])
        entity_output = get_passenger_count(params[PARAMETER_MESSAGE],
                                            params[PARAMETER_ENTITY_NAME],
                                            params[PARAMETER_STRUCTURED_VALUE],
                                            params[PARAMETER_FALLBACK_VALUE],
                                            params[PARAMETER_BOT_MESSAGE])
        ner_logger.debug('Finished %s : %s ' % (params[PARAMETER_ENTITY_NAME], entity_output))
    except TypeError as e:
        ner_logger.exception('Exception for passenger count: %s ' % e)
        return HttpResponse(status=500)
    return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
def number_range(request):
    """Use NumberRangeDetector to detect numeral ranges.

    Args:
        request: url parameters:
            message (str): natural text on which detection logic is to be run. If a structured
                value is given, detection is run on the structured value instead of the message
            entity_name (str): name of the entity; also acts as the elastic-search dictionary
                name if the entity uses elastic-search lookup
            structured_value (str): value obtained from structured UI elements (form, payload,
                etc); when present, detection runs on it instead of the message
            fallback_value (str): value returned when detection fails on both the structured
                value and the message
            bot_message (str): previous message from a bot/agent
            unit_type (str): restrict number range detection to unit types like 'currency',
                'temperature'

    Returns:
        HttpResponse: response containing a dictionary with entity_value, original_text and
        detection; entity_value is itself a dict whose keys vary from entity to entity

    Examples:
        message = "we expect 200-300 people in room"
        entity_name = 'people_range'
        structured_value = None
        fallback_value = None
        bot_message = None
        unit_type = None
        output = number_range(request)
        print output
        >> [{'detection': 'message', 'original_text': '200-300',
             'entity_value': {'min_value': '200', 'max_value': '300', 'unit': None}}]
    """
    try:
        params = get_parameters_dictionary(request)
        ner_logger.debug('Start: %s ' % params[PARAMETER_ENTITY_NAME])
        detector = NumberRangeDetector(entity_name=params[PARAMETER_ENTITY_NAME],
                                       language=params[PARAMETER_SOURCE_LANGUAGE],
                                       unit_type=params[PARAMETER_NUMBER_UNIT_TYPE])
        entity_output = detector.detect(message=params[PARAMETER_MESSAGE],
                                        structured_value=params[PARAMETER_STRUCTURED_VALUE],
                                        fallback_value=params[PARAMETER_FALLBACK_VALUE],
                                        bot_message=params[PARAMETER_BOT_MESSAGE])
        ner_logger.debug('Finished %s : %s ' % (params[PARAMETER_ENTITY_NAME], entity_output))
    except TypeError as e:
        ner_logger.exception('Exception for numeric: %s ' % e)
        return HttpResponse(status=500)
    return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
def _detect_time_with_coln_format(self, time_list, original_list):
    """Detect times written in the <hh>:<mm> colon format.

    1. कल 5:30 बजे
    2. आज १०:१५ बजे अजना

    Args:
        time_list (list): list of dicts consisting of the detected time entities
        original_list (list): list consisting of the original subtexts detected
            as time entities

    Returns:
        time_list (list): list of dicts consisting of the detected time entities
        original_list (list): list consisting of the original subtexts detected
            as time entities

    Example:
        >>> time_list = []
        >>> original_list = []
        >>> preprocessed_text = u'आज 05:40 बजे अजना'
        >>> self._detect_time_with_coln_format(time_list, original_list)
        >>> ([{'hh': 5, 'mm': 40, 'nn': 'pm', 'time_type': None}], ["05:40"])
    """
    if time_list is None:
        time_list = []
    if original_list is None:
        original_list = []
    matches = re.findall(r'\s*((\d+)\:(\d+))\s*', self.processed_text.lower(), re.U)
    for full_text, hour_text, minute_text in matches:
        # only accept 1-2 digit hour and minute components
        if len(hour_text) > 2 or len(minute_text) > 2:
            continue
        hh = int(hour_text)
        mm = int(minute_text)
        detected = {
            'hh': hh,
            'mm': mm,
            'tz': self.timezone.zone if self.timezone else None,
            'time_type': None,
        }
        detected['nn'] = self._get_meridiem(hh, mm, full_text)
        original_list.append(full_text)
        time_list.append(detected)
    ner_logger.debug("time_list %s" % str(time_list))
    ner_logger.debug("original_list %s" % str(original_list))
    return time_list, original_list
def _check_doc_type_for_elasticsearch(self):
    """Ensure a doc_type is configured in the connection settings.

    Raises:
        DataStoreSettingsImproperlyConfiguredException: if doc_type was not
            found in connection settings
    """
    # TODO: This check should be during init or boot
    if constants.ELASTICSEARCH_DOC_TYPE in self._connection_settings:
        return
    ner_logger.debug("No doc type is present")
    raise DataStoreSettingsImproperlyConfiguredException(
        'Elasticsearch needs doc_type. Please configure ES_DOC_TYPE in your environment'
    )
def get_regex(message, entity_name, structured_value, fallback_value, bot_message, regex):
    """Detect text abiding by *regex* using the RegexDetector class.

    Detection priority: when a structured value is present it is searched
    (verified when a match is found, unverified otherwise); otherwise the
    message is searched and fallback_value is used as a last resort.

    Args:
        message (str): natural language text to run detection on.
        entity_name (str): name of the entity.
        structured_value (str): value from structured UI elements; when present
            it is searched instead of message.
        fallback_value (str): value returned when nothing is detected.
        bot_message (str): previous message from a bot/agent (not used by the
            regex detector; kept for a uniform detector signature).
        regex (str): the regular expression to detect with.

    Returns:
        list or None: list of detection dicts, or None when nothing is detected.

    Example:
        message = 'abc123'
        entity_name = 'regex'
        regex = '\\d'
        structured_value = None
        fallback_value = None
        bot_message = None
        output = get_regex(message=message, entity_name=entity_name,
                           structured_value=structured_value,
                           fallback_value=fallback_value,
                           bot_message=bot_message, regex=regex)
        print output
        >> [{'detection': 'message', 'original_text': '1', 'entity_value': {'value': '1'}}]
    """
    # BUG FIX: the old docstring documented a `meta_data` parameter that does
    # not exist in the signature, and the same regex value was logged twice
    # in a row ("BEFORE AST LITERAL REGEX" + "REGEX"); consolidated to one line.
    ner_logger.debug("REGEX>>>>>>%s" % regex)
    regex_detection = RegexDetector(entity_name=entity_name, regex=regex)
    if structured_value:
        entity_list, original_text_list = regex_detection.detect_entity(text=structured_value)
        if entity_list:
            return output_entity_dict_list(entity_list, original_text_list, FROM_STRUCTURE_VALUE_VERIFIED)
        return output_entity_dict_value(structured_value, structured_value, FROM_STRUCTURE_VALUE_NOT_VERIFIED)
    entity_list, original_text_list = regex_detection.detect_entity(text=message)
    if entity_list:
        return output_entity_dict_list(entity_list, original_text_list, FROM_MESSAGE)
    elif fallback_value:
        return output_entity_dict_value(fallback_value, fallback_value, FROM_FALLBACK_VALUE)
    return None
def combine_output(request):
    """Call combine_output_of_detection_logic_and_tag() through an API call.

    Args:
        request: url parameters
    """
    message = request.GET.get('message')
    # entity_data arrives JSON-encoded; default to an empty mapping
    entity_data_json = json.loads(request.GET.get('entity_data', '{}'))
    ner_logger.debug('Start: %s ' % message)
    output = combine_output_of_detection_logic_and_tag(entity_data=entity_data_json, text=message)
    ner_logger.debug('Finished %s : %s ' % (message, output))
    return HttpResponse(json.dumps({'data': output}), content_type='application/json')
def check_if_index_exits(es_url, index_name):
    """Check that *index_name* exists at the Elasticsearch cluster at *es_url*.

    Queries the _cat/indices API and looks for the index name in the listing.

    Args:
        es_url (string): The elasticsearch URL
        index_name (string): Name of the index to check the existence for

    Raises:
        IndexNotFoundException: if the index is not present at es_url
    """
    index_response = requests.get('{es_url}/_cat/indices?v'.format(**{"es_url": es_url}))
    # BUG FIX: response.content is bytes on Python 3, so testing a str
    # substring against it raises TypeError; use the decoded response.text.
    if " " + index_name + " " not in index_response.text:
        message = index_name + " does not exist in " + es_url
        ner_logger.debug("check_if_index_exits - " + str(message))
        raise IndexNotFoundException(message)
def load_model(self, model_path=None, live_crf_model_path=None):
    """Load model data for the entity and initialize the tagger for it.

    If model_path is given, the model is read from local disk. Otherwise the
    model dict is fetched from S3 at live_crf_model_path; a model already in
    memory for the same path is reused without re-downloading.

    Args:
        model_path (str): Path from where model has to be loaded for the given entity.
        live_crf_model_path (str): Live path for the Crf Model

    Returns:
        tagger (pycrfsuite.Tagger()): Tagger with the loaded model
    """
    if model_path:
        # BUG FIX: the original opened the file and never closed it (resource
        # leak); a context manager guarantees the handle is released.
        with open(model_path, 'r') as file_handler:
            self.entity_model_dict = file_handler.read()
        ner_logger.debug('Model dir %s path from local' % model_path)
        return self.initialize_tagger()
    ner_logger.debug('Model dir %s path from api' % live_crf_model_path)
    if live_crf_model_path == self.loaded_model_path:
        if not self.entity_model_dict:
            # cached path but the model dict is missing; re-fetch from S3
            self.entity_model_dict = read_model_dict_from_s3(bucket_name=CRF_MODEL_S3_BUCKET_NAME,
                                                             bucket_region=CRF_MODEL_S3_BUCKET_REGION,
                                                             model_path_location=live_crf_model_path)
            ner_logger.debug('New Model dir %s path from api' % live_crf_model_path)
        else:
            # same path and model already loaded: reuse the live tagger
            return self.tagger
    else:
        self.entity_model_dict = read_model_dict_from_s3(bucket_name=CRF_MODEL_S3_BUCKET_NAME,
                                                         bucket_region=CRF_MODEL_S3_BUCKET_REGION,
                                                         model_path_location=live_crf_model_path)
        ner_logger.debug('New Model dir %s path from cache' % live_crf_model_path)
        self.loaded_model_path = live_crf_model_path
    return self.initialize_tagger()
def __init__(self, entity_name, language=ENGLISH_LANG, timezone='UTC', past_date_referenced=False):
    """Initialize a DateDetector with the given entity_name and pytz timezone object.

    Args:
        entity_name: string with which detected date entity substrings are
            replaced on calling detect_entity()
        language (str): language code of the text to run detection on
        timezone (Optional, str): timezone identifier string used to create a
            pytz timezone object; default is UTC
        past_date_referenced (bool): whether past (instead of future) dates are
            referenced by date text like 'kal', 'parso'
    """
    self.text = ''
    self.tagged_text = ''
    self.processed_text = ''
    self.date = []
    self.original_date_text = []
    self.entity_name = entity_name
    self.tag = '__' + entity_name + '__'
    self.bot_message = None
    self.language = language
    try:
        self.timezone = pytz.timezone(timezone)
    except Exception as err:
        ner_logger.debug('Timezone error: %s ' % err)
        self.timezone = pytz.timezone('UTC')
        ner_logger.debug('Default timezone passed as "UTC"')
    self.now_date = datetime.datetime.now(tz=self.timezone)
    # Prefer a language-specific date detector; fall back to the generic
    # standard-regex detector when no module exists for this language.
    try:
        lang_module = importlib.import_module(
            'ner_v2.detectors.temporal.date.{0}.date_detection'.format(self.language))
        self.language_date_detector = lang_module.DateDetector(
            entity_name=self.entity_name,
            past_date_referenced=past_date_referenced,
            timezone=self.timezone)
    except ImportError:
        fallback_module = importlib.import_module(
            'ner_v2.detectors.temporal.date.standard_date_regex')
        self.language_date_detector = fallback_module.DateDetector(
            entity_name=self.entity_name,
            data_directory_path=get_lang_data_path(
                detector_path=os.path.abspath(__file__),
                lang_code=self.language),
            timezone=self.timezone,
            past_date_referenced=past_date_referenced)
def _check_doc_type_for_elasticsearch(self):
    """Ensure a doc_type is configured in the connection settings.

    Raises:
        DataStoreSettingsImproperlyConfiguredException: if doc_type was not
            found in connection settings
    """
    doc_type_configured = constants.ELASTICSEARCH_DOC_TYPE in self._connection_settings
    if not doc_type_configured:
        ner_logger.debug(
            "No doc type is present in chatbot_ner.config.CHATBOT_NER_DATASTORE"
        )
        raise DataStoreSettingsImproperlyConfiguredException(
            'Elasticsearch needs doc_type. Please configure ES_DOC_TYPE in your environment'
        )
def ner(request):
    """Tag the message by calling run_ner(). Invoked through an API call.

    Args:
        request: url parameters
    """
    message = request.GET.get('message')
    raw_entities = request.GET.get('entities', [])
    # entities arrive as a Python-literal string; literal_eval parses it
    # without evaluating arbitrary code
    entities = ast.literal_eval(raw_entities) if raw_entities else []
    ner_logger.debug('Start: %s -- %s' % (message, entities))
    output = run_ner(entities=entities, message=message)
    ner_logger.debug('Finished %s : %s ' % (message, output))
    return HttpResponse(json.dumps({'data': output}), content_type='application/json')
def generate_crf_model_path(self):
    """Generate the directory path (with timestamp suffix) to store the entity model.

    Creates the entity's directory under CRF_MODELS_PATH if it does not
    already exist.

    Returns:
        output_directory (str): The path where the model needs to be stored.
    """
    file_path = CRF_MODELS_PATH + self.entity_name
    entity_path = CRF_MODELS_PATH + self.entity_name + '/' + self.entity_name
    # Cleanup: the original computed os.path.dirname(entity_path) twice into
    # two differently-named variables used interchangeably; one suffices.
    entity_directory = os.path.dirname(entity_path)
    if not os.path.exists(entity_directory):
        os.makedirs(entity_directory)
        ner_logger.debug('creating new directory %s' % file_path)
    output_directory_prefix = CRF_MODELS_PATH + self.entity_name + '/'
    output_directory_postfix = datetime.now().strftime("%d%m%Y-%H%M%S")
    return output_directory_prefix + self.entity_name + output_directory_postfix
def load_word_vectors_local():
    """Load the word list and word vectors from the configured local paths.

    Returns:
        vocab (list): word_list present at the specified path.
        word_vectors (numpy.array): word_vectors present at the specified path.
        Both are empty on any load failure (the error is logged).
    """
    vocab = []
    word_vectors = np.array([])
    try:
        # BUG FIX: the original left both file handles open (the first handle
        # was rebound without being closed); context managers release them.
        with open(CRF_EMBEDDINGS_PATH_VOCAB, 'rb') as vocab_file:
            vocab = pickle.load(vocab_file)
        with open(CRF_EMBEDDINGS_PATH_VECTORS, 'rb') as vectors_file:
            word_vectors = np.array(pickle.load(vectors_file))
    except Exception as e:
        ner_logger.debug('Error in loading local word vectors %s' % e)
    return vocab, word_vectors
def get_model_output(self, entity_type, bot_message, user_message):
    """Run the loaded CRF model over the messages and return the tagged output.

    When the model is available (MODEL_RUN), this initializes the city model
    files, feeds both messages to the tagger, runs the CRF and converts its
    output to the final list; otherwise an error is logged and an empty list
    is returned.
    """
    if not MODEL_RUN:
        ner_logger.debug('MODEL IS NOT RUNNING: CRFPP not installed')
        return []
    output_list = []
    self.initialize_files(CITY_ENTITY_TYPE)
    self.add_data_to_tagger(bot_message, user_message)
    crf_output = self.run_crf()
    if entity_type == CITY_ENTITY_TYPE:
        output_list = self.generate_city_output(crf_data=crf_output)
    return output_list
def shopping_size(request):
    """Detect shopping sizes by calling get_shopping_size(). Invoked through an API call.

    Args:
        request (django.http.HttpRequest): request carrying url parameters

    Returns:
        django.http.HttpResponse: JSON body {'data': <detected entities>} on
        success, or an empty 500 response on a TypeError.
    """
    try:
        params = parse_parameters_from_request(request)
        entity_output = get_shopping_size(params[PARAMETER_MESSAGE],
                                          params[PARAMETER_ENTITY_NAME],
                                          params[PARAMETER_STRUCTURED_VALUE],
                                          params[PARAMETER_FALLBACK_VALUE],
                                          params[PARAMETER_BOT_MESSAGE])
        ner_logger.debug('Finished %s : %s ' % (params[PARAMETER_ENTITY_NAME], entity_output))
    except TypeError as e:
        ner_logger.exception('Exception for shopping_size: %s ' % e)
        return HttpResponse(status=500)
    return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
def delete_entity_data_by_values(connection, index_name, doc_type, entity_name, values=None, **kwargs):
    """Delete entity data from ES for the given entity, optionally restricted to values.

    Args:
        connection (elasticsearch.client.Elasticsearch): Elasticsearch client object
        index_name (str): The name of the index
        doc_type (str): The type of the documents that will be indexed
        entity_name (str): name of the entity for which the data is to be deleted.
        values (str, optional): List of values for which data is to be fetched.
            If None, all records are deleted

    Returns:
        None
    """
    records = get_entity_data(
        connection=connection,
        index_name=index_name,
        doc_type=doc_type,
        entity_name=entity_name,
        values=values,
        **kwargs
    )
    # Build per-record delete actions, batched to the bulk helper size
    batches = []
    current_batch = []
    for record in records:
        current_batch.append({
            '_index': index_name,
            '_type': doc_type,
            '_id': record["_id"],
            '_op_type': 'delete',
        })
        if len(current_batch) == constants.ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE:
            batches.append(current_batch)
            current_batch = []
    if current_batch:
        batches.append(current_batch)
    for batch in batches:
        result = helpers.bulk(connection, batch, stats_only=True, **kwargs)
        ner_logger.debug('delete_entity_data_by_values: entity_name: {0} result {1}'.format(entity_name,
                                                                                            str(result)))
def phone_number(request):
    """Detect phone numbers by calling get_phone_number(). Invoked through an API call.

    Args:
        request (django.http.HttpRequest): request carrying url parameters

    Returns:
        django.http.HttpResponse: JSON body {'data': <detected entities>}
        (an empty dict on failure).
    """
    # NOTE(review): the original logged 'Start: None' because entity_name is
    # never assigned from the parameters; preserved for log compatibility.
    entity_name = None
    try:
        parameters_dict = get_parameters_dictionary(request)
        ner_logger.debug('Start: %s ' % entity_name)
        entity_output = get_phone_number(parameters_dict[PARAMETER_MESSAGE],
                                         parameters_dict[PARAMETER_ENTITY_NAME],
                                         parameters_dict[PARAMETER_STRUCTURED_VALUE],
                                         parameters_dict[PARAMETER_STRUCTURED_VALUE_VERIFICATION],
                                         parameters_dict[PARAMETER_FALLBACK_VALUE],
                                         parameters_dict[PARAMETER_EXPERT_MESSAGE])
    # BUG FIX: `except Exception, e` is Python 2-only syntax and a
    # SyntaxError on Python 3; use the `as` form.
    except Exception as e:
        entity_output = {}
        ner_logger.debug('Exception for phone_number: %s ' % e)
    # BUG FIX: the original view never returned an HttpResponse; return the
    # detected data in the same shape as the sibling views.
    return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
def __init__(self, entity_name, data_directory_path, timezone='UTC', range_enabled=False, form_check=False):
    """Base regex date detector, parameterised by a language data folder.

    Language-specific date classes import this base and pass their data
    folder path; the standard regexes and their parsers for that language
    are created from those files to detect dates.

    Args:
        entity_name: entity name used to tag detected substrings
        data_directory_path (str): path of data folder for given language
        timezone (str): user timezone, default UTC
        range_enabled (bool): not referenced in this constructor
        form_check (bool): not referenced in this constructor
    """
    self.text = ''
    self.tagged_text = ''
    self.processed_text = ''
    self.entity_name = entity_name
    self.tag = '__' + entity_name + '__'
    try:
        self.timezone = pytz.timezone(timezone)
    except Exception as err:
        ner_logger.debug('Timezone error: %s ' % err)
        self.timezone = pytz.timezone('UTC')
        ner_logger.debug('Default timezone passed as "UTC"')
    self.now_date = datetime.datetime.now(tz=self.timezone)
    self.bot_message = None
    # words for time, numerals and words which come in reference to some date
    self.time_constant_dict = {}
    self.datetime_constant_dict = {}
    self.numerals_constant_dict = {}
    # dynamically created standard time regex, built from the language data files
    self.regex_time = None
    # populate the regexes and constant dicts from the data folder
    self.init_regex_and_parser(data_directory_path)
    # default order in which these regex detectors will run
    self.detector_preferences = [
        self._detect_time_with_coln_format,
        self._detect_hour_minute
    ]