def location(request):
    """Detect location entities for an API request.

    The detection parameters are parsed from ``request`` and forwarded to
    ``get_location``.

    Args:
        request: incoming request carrying the detection parameters

    Returns:
        HttpResponse: JSON body ``{"data": <entity_output>}`` on success, or
        an empty response with status 500 when a ``TypeError`` is raised
        while parsing or detecting.
    """
    try:
        params = parse_parameters_from_request(request)
        # Positional arguments of get_location, looked up in call order.
        positional_keys = (
            PARAMETER_MESSAGE,
            PARAMETER_ENTITY_NAME,
            PARAMETER_STRUCTURED_VALUE,
            PARAMETER_FALLBACK_VALUE,
            PARAMETER_BOT_MESSAGE,
        )
        positional_args = [params[key] for key in positional_keys]
        entity_output = get_location(*positional_args,
                                     predetected_values=params[PARAMETER_PRIOR_RESULTS])
        ner_logger.debug('Finished %s : %s ' % (params[PARAMETER_ENTITY_NAME], entity_output))
    except TypeError as error:
        ner_logger.exception('Exception for location: %s ' % error)
        return HttpResponse(status=500)
    else:
        body = json.dumps({'data': entity_output})
        return HttpResponse(body, content_type='application/json')
def shopping_size(request):
    """Detect shopping-size entities for an API request.

    The detection parameters are read from ``request`` and forwarded to
    ``get_shopping_size``.

    Args:
        request: incoming request carrying the detection parameters

    Returns:
        HttpResponse: JSON body ``{"data": <entity_output>}`` on success, or
        an empty response with status 500 when a ``TypeError`` is raised.
    """
    try:
        params = get_parameters_dictionary(request)
        entity_name = params[PARAMETER_ENTITY_NAME]
        ner_logger.debug('Start: %s ' % entity_name)
        entity_output = get_shopping_size(
            params[PARAMETER_MESSAGE],
            entity_name,
            params[PARAMETER_STRUCTURED_VALUE],
            params[PARAMETER_FALLBACK_VALUE],
            params[PARAMETER_BOT_MESSAGE]
        )
        ner_logger.debug('Finished %s : %s ' % (entity_name, entity_output))
    except TypeError as error:
        ner_logger.exception('Exception for shopping_size: %s ' % error)
        return HttpResponse(status=500)
    else:
        body = json.dumps({'data': entity_output})
        return HttpResponse(body, content_type='application/json')
def time_with_range(request):
    """Detect time ranges for an API request via ``get_time_with_range``.

    Args:
        request (django.http.request.HttpRequest): HttpRequest object

    Returns:
        response (django.http.response.HttpResponse): JSON body
        ``{"data": <entity_output>}`` on success, or an empty response with
        status 500 when any ``Exception`` is raised while parsing or
        detecting.
    """
    try:
        params = parse_parameters_from_request(request)
        # All six values are passed positionally, in this order.
        positional_keys = (
            PARAMETER_MESSAGE,
            PARAMETER_ENTITY_NAME,
            PARAMETER_STRUCTURED_VALUE,
            PARAMETER_FALLBACK_VALUE,
            PARAMETER_BOT_MESSAGE,
            PARAMETER_TIMEZONE,
        )
        entity_output = get_time_with_range(*[params[key] for key in positional_keys])
        ner_logger.debug('Finished %s : %s ' % (params[PARAMETER_ENTITY_NAME], entity_output))
    except Exception as error:
        ner_logger.exception('Exception for time_with_range: %s ' % error)
        return HttpResponse(status=500)
    else:
        body = json.dumps({'data': entity_output})
        return HttpResponse(body, content_type='application/json')
def transfer_entities(request):
    """Transfer entities from the source to the destination datastore.

    Reads the JSON document stored under ``EXTERNAL_API_DATA`` in the POST
    body and passes its ``ENTITY_LIST`` value to
    ``DataStore.transfer_entities_elastic_search``.

    Args:
        request (HttpRequest): incoming POST request

    Returns:
        HttpResponse: JSON ``{"success", "error", "result"}`` with status 200
        on success, or status 500 with ``error`` filled in on any failure.
    """
    response = {"success": False, "error": "", "result": []}

    def _failure(error):
        # Record the error, log it with its traceback and build the 500 reply.
        response['error'] = str(error)
        ner_logger.exception('Error: %s' % error)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)

    try:
        payload = json.loads(request.POST.get(EXTERNAL_API_DATA))
        entities = payload.get(ENTITY_LIST)
        DataStore().transfer_entities_elastic_search(entity_list=entities)
        response['success'] = True
    except (IndexNotFoundException, InvalidESURLException,
            SourceDestinationSimilarException, InternalBackupException,
            AliasNotFoundException, PointIndexToAliasException,
            FetchIndexForAliasException, DeleteIndexFromAliasException,
            AliasForTransferException, IndexForTransferException,
            NonESEngineTransferException) as known_error:
        return _failure(known_error)
    except BaseException as unexpected_error:
        return _failure(unexpected_error)

    return HttpResponse(json.dumps(response), content_type='application/json', status=200)
def text(request):
    """Detect textual entities in the request message using ``TextDetector``.

    Optional fuzziness settings from the request are applied to the detector
    before detection runs.

    Args:
        request: incoming request carrying the detection parameters

    Returns:
        HttpResponse: JSON body ``{"data": <entity_output>}`` on success, or
        an empty response with status 500 when a ``TypeError`` is raised.
    """
    try:
        params = get_parameters_dictionary(request)
        entity_name = params[PARAMETER_ENTITY_NAME]
        ner_logger.debug('Start: %s ' % entity_name)

        fuzziness = params[PARAMETER_FUZZINESS]
        min_token_len_fuzziness = params[PARAMETER_MIN_TOKEN_LEN_FUZZINESS]
        detector = TextDetector(entity_name=entity_name,
                                source_language_script=params[PARAMETER_LANGUAGE_SCRIPT])
        ner_logger.debug('fuzziness: %s min_token_len_fuzziness %s'
                         % (str(fuzziness), str(min_token_len_fuzziness)))

        # Both settings are optional; falsy values leave the detector defaults.
        if fuzziness:
            detector.set_fuzziness_threshold(parse_fuzziness_parameter(fuzziness))
        if min_token_len_fuzziness:
            detector.set_min_token_size_for_levenshtein(min_size=int(min_token_len_fuzziness))

        entity_output = detector.detect(
            message=params[PARAMETER_MESSAGE],
            structured_value=params[PARAMETER_STRUCTURED_VALUE],
            fallback_value=params[PARAMETER_FALLBACK_VALUE],
            bot_message=params[PARAMETER_BOT_MESSAGE])
        ner_logger.debug('Finished %s : %s ' % (entity_name, entity_output))
    except TypeError as error:
        ner_logger.exception('Exception for text_synonym: %s ' % error)
        return HttpResponse(status=500)
    else:
        body = json.dumps({'data': entity_output})
        return HttpResponse(body, content_type='application/json')
def update_crf_training_data(request):
    """Update the CRF training data stored for an entity.

    Reads the JSON document stored under ``EXTERNAL_API_DATA`` in the POST
    body and passes its ``ENTITY_NAME`` and ``SENTENCES`` values to
    ``DataStore.update_entity_crf_data``.

    Args:
        request (HttpRequest): incoming POST request

    Returns:
        HttpResponse: JSON ``{"success", "error", "result"}`` with status 200
        on success, or status 500 with ``error`` filled in on any failure.

    Example of data present in the POST request body (as given by the
    original author; the exact key names depend on the module constants):
        key: "external_api_data"
        value: {"sentence_list": ["hello pratik", "hello hardik"],
                "entity_list": [["pratik"], ["hardik"]],
                "entity_name": "training_try3",
                "language_script": "en"}
    """
    response = {"success": False, "error": "", "result": []}

    def _failure(error):
        # Record the error, log it with its traceback and build the 500 reply.
        response['error'] = str(error)
        ner_logger.exception('Error: %s' % error)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)

    try:
        payload = json.loads(request.POST.get(EXTERNAL_API_DATA))
        sentences = payload.get(SENTENCES)
        entity_name = payload.get(ENTITY_NAME)
        DataStore().update_entity_crf_data(entity_name=entity_name, sentences=sentences)
        response['success'] = True
    except (DataStoreSettingsImproperlyConfiguredException,
            EngineNotImplementedException,
            EngineConnectionException,
            FetchIndexForAliasException) as known_error:
        return _failure(known_error)
    except BaseException as unexpected_error:
        return _failure(unexpected_error)

    return HttpResponse(json.dumps(response), content_type='application/json', status=200)
def update_dictionary(request):
    """Update the dictionary data stored for an entity.

    Reads the JSON document stored under ``EXTERNAL_API_DATA`` in the POST
    body and passes its ``ENTITY_NAME``, ``ENTITY_DATA`` and
    ``LANGUAGE_SCRIPT`` values to ``DataStore.update_entity_data``.

    Args:
        request (HttpRequest): incoming POST request

    Returns:
        HttpResponse: JSON ``{"success", "error", "result"}`` with status 200
        on success, or status 500 with ``error`` filled in on any failure.
    """
    response = {"success": False, "error": "", "result": []}

    def _failure(error):
        # Record the error, log it with its traceback and build the 500 reply.
        response['error'] = str(error)
        ner_logger.exception('Error: %s' % error)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)

    try:
        payload = json.loads(request.POST.get(EXTERNAL_API_DATA))
        entity_name = payload.get(ENTITY_NAME)
        entity_data = payload.get(ENTITY_DATA)
        language_script = payload.get(LANGUAGE_SCRIPT)
        DataStore().update_entity_data(entity_name=entity_name,
                                       entity_data=entity_data,
                                       language_script=language_script)
        response['success'] = True
    except (DataStoreSettingsImproperlyConfiguredException,
            EngineNotImplementedException,
            EngineConnectionException,
            FetchIndexForAliasException) as known_error:
        return _failure(known_error)
    except BaseException as unexpected_error:
        return _failure(unexpected_error)

    return HttpResponse(json.dumps(response), content_type='application/json', status=200)
def get_entity_word_variants(request):
    """Return the dictionary of an entity as a sorted list of value variants.

    The entity name is read from the ``ENTITY_NAME`` GET parameter and looked
    up with ``DataStore.get_entity_dictionary``.

    Args:
        request (HttpRequest): incoming GET request

    Returns:
        HttpResponse: JSON ``{"success", "error", "result"}``. On success
        (status 200) ``result`` is a list of
        ``{"value": <key>, "variants": <dictionary entry for key>}`` items
        ordered by key; on any failure the status is 500 and ``error`` is
        filled in.
    """
    response = {"success": False, "error": "", "result": []}

    def _failure(error):
        # Record the error, log it with its traceback and build the 500 reply.
        response['error'] = str(error)
        ner_logger.exception('Error: %s' % error)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)

    try:
        entity_name = request.GET.get(ENTITY_NAME)
        dictionary = DataStore().get_entity_dictionary(entity_name=entity_name)
        # sorted() accepts the keys view directly on both Python 2 and 3.
        response['result'] = [
            {'value': value, 'variants': dictionary[value]}
            for value in sorted(dictionary.keys())
        ]
        response['success'] = True
    except (DataStoreSettingsImproperlyConfiguredException,
            EngineNotImplementedException,
            EngineConnectionException,
            FetchIndexForAliasException) as known_error:
        return _failure(known_error)
    except BaseException as unexpected_error:
        return _failure(unexpected_error)

    return HttpResponse(json.dumps(response), content_type='application/json', status=200)
def get_crf_training_data(request):
    """Return the CRF training data stored for an entity.

    The entity name is read from the ``ENTITY_NAME`` GET parameter. The
    optional ``LANGUAGES`` GET parameter is a comma-separated string; when it
    is missing or empty an empty list is passed to the datastore.

    Args:
        request (HttpRequest): incoming GET request

    Returns:
        HttpResponse: JSON ``{"success", "error", "result"}``. On success
        (status 200) ``result`` holds whatever
        ``DataStore.get_crf_data_for_entity_name`` returned; on any failure
        the status is 500 and ``error`` is filled in.

    Examples:
        get request params
        key: "entity_name"
        value: "city"
    """
    response = {"success": False, "error": "", "result": []}

    def _failure(error):
        # Record the error, log it with its traceback and build the 500 reply.
        response['error'] = str(error)
        ner_logger.exception('Error: %s' % error)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)

    try:
        entity_name = request.GET.get(ENTITY_NAME)
        raw_languages = request.GET.get(LANGUAGES, '')
        if raw_languages:
            languages = raw_languages.split(',')
        else:
            languages = []
        response['result'] = DataStore().get_crf_data_for_entity_name(entity_name=entity_name,
                                                                      languages=languages)
        response['success'] = True
    except (DataStoreSettingsImproperlyConfiguredException,
            EngineNotImplementedException,
            EngineConnectionException,
            FetchIndexForAliasException) as known_error:
        return _failure(known_error)
    except BaseException as unexpected_error:
        return _failure(unexpected_error)

    return HttpResponse(json.dumps(response), content_type='application/json', status=200)
def text(request):
    """Detect textual entities in the request message using ``TextModelDetector``.

    The two boolean-like request parameters (read model from S3, read
    embeddings from remote URL) are lower-cased and decoded with
    ``json.loads`` before being handed to the detector. Optional fuzziness
    settings are applied before detection runs.

    Args:
        request: incoming request carrying the detection parameters

    Returns:
        HttpResponse: JSON body ``{"data": <entity_output>}`` on success, or
        an empty response with status 500 when a ``TypeError`` is raised.
    """
    try:
        params = get_parameters_dictionary(request)
        entity_name = params[PARAMETER_ENTITY_NAME]
        ner_logger.debug('Start: %s ' % entity_name)

        fuzziness = params[PARAMETER_FUZZINESS]
        min_token_len_fuzziness = params[PARAMETER_MIN_TOKEN_LEN_FUZZINESS]
        # Lower-casing first lets json.loads decode the flag strings.
        use_s3_model = json.loads(params[PARAMETER_READ_MODEL_FROM_S3].lower())
        use_remote_embeddings = json.loads(params[PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL].lower())

        detector = TextModelDetector(
            entity_name=entity_name,
            source_language_script=params[PARAMETER_LANGUAGE_SCRIPT],
            read_model_from_s3=use_s3_model,
            read_embeddings_from_remote_url=use_remote_embeddings,
            live_crf_model_path=params[PARAMETER_LIVE_CRF_MODEL_PATH])
        ner_logger.debug('fuzziness: %s min_token_len_fuzziness %s'
                         % (str(fuzziness), str(min_token_len_fuzziness)))

        # Both settings are optional; falsy values leave the detector defaults.
        if fuzziness:
            detector.set_fuzziness_threshold(parse_fuzziness_parameter(fuzziness))
        if min_token_len_fuzziness:
            detector.set_min_token_size_for_levenshtein(min_size=int(min_token_len_fuzziness))

        entity_output = detector.detect(
            message=params[PARAMETER_MESSAGE],
            structured_value=params[PARAMETER_STRUCTURED_VALUE],
            fallback_value=params[PARAMETER_FALLBACK_VALUE],
            bot_message=params[PARAMETER_BOT_MESSAGE])
        ner_logger.debug('Finished %s : %s ' % (entity_name, entity_output))
    except TypeError as error:
        ner_logger.exception('Exception for text_synonym: %s ' % error)
        return HttpResponse(status=500)
    else:
        body = json.dumps({'data': entity_output})
        return HttpResponse(body, content_type='application/json')
def person_name(request):
    """Detect person names for an API request via ``get_person_name``.

    Args:
        request: incoming request carrying the detection parameters

    Returns:
        HttpResponse: JSON body ``{"data": <entity_output>}`` on success, or
        an empty response with status 500 when a ``TypeError`` is raised.
    """
    try:
        params = get_parameters_dictionary(request)
        entity_name = params[PARAMETER_ENTITY_NAME]
        ner_logger.debug('Start: %s ' % entity_name)
        entity_output = get_person_name(
            message=params[PARAMETER_MESSAGE],
            entity_name=entity_name,
            structured_value=params[PARAMETER_STRUCTURED_VALUE],
            fallback_value=params[PARAMETER_FALLBACK_VALUE],
            bot_message=params[PARAMETER_BOT_MESSAGE],
            language=params[PARAMETER_SOURCE_LANGUAGE],
            predetected_values=params[PARAMETER_PRIOR_RESULTS])
        ner_logger.debug('Finished %s : %s ' % (entity_name, entity_output))
    except TypeError as error:
        ner_logger.exception('Exception for person_name: %s ' % error)
        return HttpResponse(status=500)
    else:
        body = json.dumps({'data': entity_output})
        return HttpResponse(body, content_type='application/json')
def translate_text(text, source_language_code, target_language_code=ENGLISH_LANG):
    """Translate ``text`` by calling the service behind ``TRANSLATE_URL``.

    Args:
        text (str): Text snippet which needs to be translated
        source_language_code (str): ISO-639-1 code for the language script of ``text``
        target_language_code (str): ISO-639-1 code for the target language script

    Returns:
        dict: Two keys, 'status' (bool) and ``TRANSLATED_TEXT``. The status
        is True only when the service answered with HTTP 200 and the
        translation could be extracted; otherwise the translated text stays
        None. Exceptions are logged and never propagated.

    Example:
        text: 'नमस्ते आप कैसे हैं'
        translate_text(text, 'hi', 'en')
        >> {'status': True, 'translated_text': 'Hello how are you'}
    """
    response = {TRANSLATED_TEXT: None, 'status': False}
    try:
        query_string = unicode_urlencode({
            "q": text,
            "format": "text",
            "source": source_language_code,
            "target": target_language_code
        })
        api_response = requests.get(TRANSLATE_URL + "&" + query_string, timeout=2)
        if api_response.status_code == 200:
            translations = api_response.json()["data"]["translations"]
            response[TRANSLATED_TEXT] = translations[0]["translatedText"]
            response['status'] = True
    except Exception as error:
        # Best effort: callers inspect 'status' instead of handling errors.
        ner_logger.exception('Exception while translation: %s ' % error)
    return response
def text(request):
    """Run text detection (``get_text``) on the 'message' passed in the request.

    Args:
        request (django.http.HttpRequest): incoming request carrying the
            detection parameters

    Returns:
        django.http.HttpResponse: JSON body ``{"data": <entity_output>}`` on
        success, or an empty response with status 500 when a ``TypeError``
        is raised.
    """
    try:
        params = get_parameters_dictionary(request)
        entity_name = params[PARAMETER_ENTITY_NAME]
        ner_logger.debug('Start: %s ' % entity_name)
        entity_output = get_text(
            message=params[PARAMETER_MESSAGE],
            entity_name=entity_name,
            structured_value=params[PARAMETER_STRUCTURED_VALUE],
            fallback_value=params[PARAMETER_FALLBACK_VALUE],
            bot_message=params[PARAMETER_BOT_MESSAGE],
            language=params[PARAMETER_SOURCE_LANGUAGE],
            fuzziness=params[PARAMETER_FUZZINESS],
            min_token_len_fuzziness=params[PARAMETER_MIN_TOKEN_LEN_FUZZINESS],
            live_crf_model_path=params[PARAMETER_LIVE_CRF_MODEL_PATH],
            read_model_from_s3=params[PARAMETER_READ_MODEL_FROM_S3],
            read_embeddings_from_remote_url=params[PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL],
        )
        ner_logger.debug('Finished %s : %s ' % (entity_name, entity_output))
    except TypeError as error:
        ner_logger.exception('Exception for text_synonym: %s ' % error)
        return HttpResponse(status=500)
    else:
        body = json.dumps({'data': entity_output})
        return HttpResponse(body, content_type='application/json')
def read_model_dict_from_s3(bucket_name, bucket_region, model_path_location=None):
    """Read the raw contents of a model object from an S3 bucket.

    Args:
        bucket_name (str): name of the bucket to read from
        bucket_region (str): region of the s3 bucket
        model_path_location (str, Optional): key of the object inside the
            bucket; leading '/' characters are stripped. Defaults to None,
            in which case the read fails and None is returned.

    Returns:
        The value returned by ``read()`` on the object's body, or None when
        anything goes wrong (the error is logged, not raised).

    Note:
        The content is returned exactly as read; nothing is unpickled here.
    """
    model_dict = None
    try:
        bucket = boto3.resource('s3', region_name=bucket_region).Bucket(bucket_name)
        object_key = model_path_location.lstrip('/')
        s3_object = bucket.Object(object_key)
        model_dict = s3_object.get()['Body'].read()
        ner_logger.debug("Model Read Successfully From s3")
    except Exception as error:
        ner_logger.exception("Error Reading model from s3 for domain %s " % error)
    return model_dict
def phone_number(request):
    """Detect phone numbers with ``PhoneDetector``.

    GET requests run detection on a single message; POST requests carry a
    list of messages and run bulk detection. Any other HTTP method is
    rejected with status 405.

    Request params:
        message (list or str): string for a GET request, list of text for a
            bulk call through a POST request, on which detection is run
        entity_name (str): name of the entity. Also acts as elastic-search
            dictionary name if the entity uses elastic-search lookup
        structured_value (str): value obtained from any structured elements
            (for example UI elements like form, payload, etc). If present,
            detection is run on it instead of the message
        fallback_value (str): returned as output if detection finds nothing
            in structured_value or message
        bot_message (str): previous message from a bot/agent
        source_language (str): language for which phone numbers are detected

    Returns:
        response (django.http.response.HttpResponse): ``JsonResponse`` with
        ``{"data": <entity_output>}`` on success (``data`` is None when the
        message is neither a string nor a list/tuple), status 405 for
        unsupported methods, status 500 when a ``TypeError`` is raised.

    Examples:
        Single message:
            message = "Call 02226129857 and message +1(408) 92-124 and send 100rs to 91 9820334416 9920441344"
            entity_name = 'phone_number'
            source_language = 'en'
            entity_output:
                [
                    {"detection": "message", "original_text": "91 9820334416",
                     "entity_value": {"value": "919820334416"}, "language": "en"},
                    {"detection": "message", "original_text": "9920441344",
                     "entity_value": {"value": "9920441344"}, "language": "en"},
                    {"detection": "message", "original_text": "02226129857",
                     "entity_value": {"value": "02226129857"}, "language": "en"},
                    {"detection": "message", "original_text": "+1(408) 92-124",
                     "entity_value": {"value": "140892124"}, "language": "en"}
                ]

        Bulk:
            message = ['Call 02226129857', 'message +1(408) 92-124',
                       'send 100rs to 91 9820334416 9920441344']
            entity_name = 'phone_number'
            source_language = 'en'
            entity_output:
                [
                    [{"detection": "message", "original_text": "02226129857",
                      "entity_value": {"value": "02226129857"}, "language": "en"}],
                    [{"detection": "message", "original_text": "+1(408) 92-124",
                      "entity_value": {"value": "140892124"}, "language": "en"}],
                    [{"detection": "message", "original_text": "91 9820334416",
                      "entity_value": {"value": "919820334416"}, "language": "en"},
                     {"detection": "message", "original_text": "9920441344",
                      "entity_value": {"value": "9920441344"}, "language": "en"}]
                ]
    """
    try:
        if request.method == "POST":
            parameters_dict = parse_post_request(request)
            ner_logger.debug('Start Bulk Detection: %s ' % parameters_dict[PARAMETER_ENTITY_NAME])
        elif request.method == "GET":
            parameters_dict = get_parameters_dictionary(request)
            ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME])
        else:
            # Other methods used to fall through with an empty parameter dict
            # and crash with an uncaught KeyError; reject them explicitly.
            not_allowed = HttpResponse(status=405)
            not_allowed['Allow'] = 'GET, POST'
            return not_allowed

        entity_name = parameters_dict[PARAMETER_ENTITY_NAME]
        language = parameters_dict[PARAMETER_SOURCE_LANGUAGE]
        ner_logger.debug('Entity Name %s' % entity_name)
        ner_logger.debug('Source Language %s' % language)

        phone_number_detection = PhoneDetector(entity_name=entity_name,
                                               language=language,
                                               locale=parameters_dict[PARAMETER_LOCALE])
        message = parameters_dict[PARAMETER_MESSAGE]
        entity_output = None
        ner_logger.debug(parameters_dict)
        if isinstance(message, six.string_types):
            entity_output = phone_number_detection.detect(
                message=message,
                structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE],
                fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE],
                bot_message=parameters_dict[PARAMETER_BOT_MESSAGE])
        elif isinstance(message, (list, tuple)):
            entity_output = phone_number_detection.detect_bulk(messages=message)
        ner_logger.debug('Finished %s : %s ' % (entity_name, entity_output))
    except TypeError as e:
        ner_logger.exception('Exception for phone_number: %s ' % e)
        return HttpResponse(status=500)

    return JsonResponse({'data': entity_output})
def text(request):
    """Run text detection (``get_text``) on the message or list of messages in the request.

    GET requests carry a single message; POST requests carry a list of
    messages for bulk detection. Any other HTTP method is rejected with
    status 405.

    Args:
        request (django.http.HttpRequest): incoming request carrying the
            detection parameters

    Returns:
        response (django.http.HttpResponse): JSON body
        ``{"data": <entity_output>}`` on success, status 405 for unsupported
        methods, status 500 when a ``TypeError`` is raised. ``entity_output``
        is a list of dicts (detected entities with their original texts) for
        a single message, or a list of such lists, one per message, for bulk
        detection.

    Examples:
        --- Single message
            >>> message = u'i want to order chinese from mainland china and pizza from domminos'
            >>> entity_name = 'restaurant'
            >>> entity_output = get_text(message=message,
            >>>                          entity_name=entity_name,
            >>>                          structured_value=None,
            >>>                          fallback_value=None,
            >>>                          bot_message=None)
            >>> print(entity_output)
            [
                {'detection': 'message', 'original_text': 'mainland china',
                 'entity_value': {'value': u'Mainland China'}},
                {'detection': 'message', 'original_text': 'domminos',
                 'entity_value': {'value': u"Domino's Pizza"}}
            ]

            >>> message = u'i wanted to watch movie'
            >>> entity_name = 'movie'
            >>> structured_value = u'inferno'
            >>> print(entity_output)
            [
                {'detection': 'structure_value_verified', 'original_text': 'inferno',
                 'entity_value': {'value': u'Inferno'}}
            ]

            >>> message = u'i wanted to watch inferno'
            >>> entity_name = 'movie'
            >>> structured_value = u'delhi'
            >>> print(entity_output)
            [
                {'detection': 'message', 'original_text': 'inferno',
                 'entity_value': {'value': u'Inferno'}}
            ]

        --- Bulk detection
            >>> message = [u'book a flight to mumbai',
            >>>            u'i want to go to delhi from mumbai']
            >>> entity_name = u'city'
            >>> print(entity_output)
            [
                [
                    {'detection': 'message', 'entity_value': {'value': u'mumbai'},
                     'original_text': u'mumbai'}
                ],
                [
                    {'detection': 'message', 'entity_value': {'value': u'New Delhi'},
                     'original_text': u'delhi'},
                    {'detection': 'message', 'entity_value': {'value': u'mumbai'},
                     'original_text': u'mumbai'}
                ]
            ]
    """
    try:
        if request.method == "POST":
            parameters_dict = parse_post_request(request)
            ner_logger.debug('Start Bulk Detection: %s ' % parameters_dict[PARAMETER_ENTITY_NAME])
        elif request.method == "GET":
            parameters_dict = get_parameters_dictionary(request)
            ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME])
        else:
            # Other methods used to fall through with an empty parameter dict
            # and crash with an uncaught KeyError; reject them explicitly.
            not_allowed = HttpResponse(status=405)
            not_allowed['Allow'] = 'GET, POST'
            return not_allowed

        entity_output = get_text(
            message=parameters_dict[PARAMETER_MESSAGE],
            entity_name=parameters_dict[PARAMETER_ENTITY_NAME],
            structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE],
            fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE],
            bot_message=parameters_dict[PARAMETER_BOT_MESSAGE],
            language=parameters_dict[PARAMETER_SOURCE_LANGUAGE],
            fuzziness=parameters_dict[PARAMETER_FUZZINESS],
            min_token_len_fuzziness=parameters_dict[PARAMETER_MIN_TOKEN_LEN_FUZZINESS],
            live_crf_model_path=parameters_dict[PARAMETER_LIVE_CRF_MODEL_PATH],
            read_model_from_s3=parameters_dict[PARAMETER_READ_MODEL_FROM_S3],
            read_embeddings_from_remote_url=parameters_dict[PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL],
            predetected_values=parameters_dict[PARAMETER_PRIOR_RESULTS]
        )
        ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output))
    except TypeError as e:
        ner_logger.exception('Exception for text_synonym: %s ' % e)
        return HttpResponse(status=500)

    return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
def text(request):
    """Detect multiple text entities for one or more messages.

    Only POST is supported; GET is answered with status 405. The request is
    checked with ``verify_text_request`` and the detection result is obtained
    from ``get_text_entity_detection_data``.

    Args:
        request: request for text detection

    Request parameters (as described by the original author):
        message (list of str): messages on which detection is to be run
        source_language (str): language in which entities are to be detected
        bot_message (str): previous message from a bot/agent
        entities (dict): entities to be detected, keyed by entity name (which
            also acts as elastic-search dictionary name if the entity uses
            elastic-search lookup). Each entity dict may contain:
                structured_value (str): [Optional] value obtained from any
                    structured elements (form, payload, etc). If present,
                    detection is run on it instead of the message
                fallback_value (str): [Optional] returned as output when
                    nothing is detected in structured_value or message
                use_fallback (bool): defaults to False; if set for a single
                    message the fallback value will be used
                fuzziness (int): [Optional] fuzziness value for the entity
                min_token_size_for_fuzziness (int): [Optional] minimum token
                    size for a fuzzy match (the example request below uses
                    the key "min_token_len_fuzziness")

    Returns:
        response (django.http.response.HttpResponse): ``JsonResponse`` with
            status 200 and ``{"success": true, "error": null, "data": [...]}``
                when detection returned a non-empty result
            status 400 when a ``KeyError`` or ``TypeError`` is raised
            status 405 for GET
            status 500 for any other exception, or when no data was produced

    Example:
        input request:
            {
                "message": ["I want to go to Jabalpur"],
                "bot_message": null,
                "language_script": "en",
                "source_language": "en",
                "entities": {
                    "city": {
                        "structured_value": "Delhi",
                        "fallback_value": null,
                        "predetected_values": ["Mumbai"],
                        "fuzziness": null,
                        "min_token_len_fuzziness": null,
                        "use_fallback": false
                    },
                    "restaurant": {
                        "structured_value": null,
                        "fallback_value": null,
                        "predetected_values": null,
                        "fuzziness": null,
                        "min_token_len_fuzziness": null,
                        "use_fallback": false
                    }
                }
            }
        output response:
            {
                "success": true,
                "error": null,
                "data": [
                    {
                        "entities": {
                            "restaurant": [],
                            "city": [
                                {
                                    "entity_value": {"value": "New Delhi",
                                                     "datastore_verified": true,
                                                     "model_verified": false},
                                    "detection": "structure_value_verified",
                                    "original_text": "delhi",
                                    "language": "en"
                                },
                                {
                                    "entity_value": {"value": "Mumbai",
                                                     "datastore_verified": false,
                                                     "model_verified": true},
                                    "detection": "structure_value_verified",
                                    "original_text": "Mumbai",
                                    "language": "en"
                                }
                            ]
                        },
                        "language": "en"
                    }
                ]
            }
    """
    data = []

    if request.method == "GET":
        return JsonResponse({"success": False, "error": "Get method is not allowed"}, status=405)

    if request.method == "POST":
        ner_logger.debug("Fetching result")
        try:
            verify_text_request(request)
            # Verification passed, so fetch the detection data.
            data = get_text_entity_detection_data(request)
        except (KeyError, TypeError) as err:
            # Both indicate a malformed request -> client error.
            failure = {"success": False, "error": str(err)}
            # TODO: move to ner_logger.error
            ner_logger.exception(failure)
            return JsonResponse(failure, status=400)
        except Exception as err:
            failure = {"success": False, "error": str(err)}
            ner_logger.exception(failure)
            return JsonResponse(failure, status=500)

    if not data:
        return JsonResponse({"success": False, "error": "Some error while parsing"}, status=500)
    return JsonResponse({"success": True, "error": None, "data": data}, status=200)
def train_crf_model(request):
    """Train a CRF model for an entity and report where it was written.

    Reads the JSON document stored under ``EXTERNAL_API_DATA`` in the POST
    body. When its ``ES_CONFIG`` value is truthy the model is trained with
    ``CrfTrain.train_model_from_es_data``; otherwise the ``SENTENCE_LIST``
    and ``ENTITY_LIST`` values from the same document are passed to
    ``CrfTrain.train_crf_model_from_list``.

    Args:
        request (HttpRequest): incoming POST request

    Returns:
        HttpResponse: JSON ``{"success", "error", "result"}``. On success
        (status 200) ``result`` is ``{LIVE_CRF_MODEL_PATH: <model path>}``;
        on any failure the status is 500 and ``error`` is filled in.

    Post Request Body:
        key: "external_api_data"
        value: {
            "entity_name": "crf_test",
            "read_model_from_s3": true,
            "es_config": true,
            "read_embeddings_from_remote_url": true
        }
    """
    response = {"success": False, "error": "", "result": {}}

    def _failure(error):
        # Record the error, log it with its traceback and build the 500 reply.
        response['error'] = str(error)
        ner_logger.exception('Error: %s' % error)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)

    try:
        payload = json.loads(request.POST.get(EXTERNAL_API_DATA))
        entity_name = payload.get(ENTITY_NAME)
        read_model_from_s3 = payload.get(READ_MODEL_FROM_S3)
        es_config = payload.get(ES_CONFIG)
        read_embeddings_from_remote_url = payload.get(READ_EMBEDDINGS_FROM_REMOTE_URL)

        trainer = CrfTrain(entity_name=entity_name,
                           read_model_from_s3=read_model_from_s3,
                           read_embeddings_from_remote_url=read_embeddings_from_remote_url)

        if not es_config:
            # Training data is supplied inline with the request.
            sentence_list = payload.get(SENTENCE_LIST)
            entity_list = payload.get(ENTITY_LIST)
            model_path = trainer.train_crf_model_from_list(sentence_list=sentence_list,
                                                           entity_list=entity_list)
        else:
            model_path = trainer.train_model_from_es_data()

        response['result'] = {LIVE_CRF_MODEL_PATH: model_path}
        response['success'] = True
    except (IndexNotFoundException, InvalidESURLException,
            SourceDestinationSimilarException, InternalBackupException,
            AliasNotFoundException, PointIndexToAliasException,
            FetchIndexForAliasException, DeleteIndexFromAliasException,
            AliasForTransferException, IndexForTransferException,
            NonESEngineTransferException) as known_error:
        return _failure(known_error)
    except BaseException as unexpected_error:
        return _failure(unexpected_error)

    return HttpResponse(json.dumps(response), content_type='application/json', status=200)
def _get_substring_from_processed_text(text, matched_tokens):
    """Return the part of ``text`` that corresponds to ``matched_tokens``.

    ``text`` is tokenized with ``TOKENIZER`` and searched for the first
    contiguous run of tokens equal to ``matched_tokens``. The slice of
    ``text`` spanning that run is returned, so characters that sit between
    the matched tokens in ``text`` but are not part of any token are
    recovered.

    Args:
        text (string or unicode): sentence to search in
        matched_tokens (list): tokens (usually from fuzzy match results from
            ES) to find as a contiguous run in the tokenized sentence

    Returns:
        str or unicode: the slice of ``text`` covering the matched tokens. If
        no run matches, or a ValueError/IndexError is raised while locating
        tokens, the tokens joined by single spaces.

    E.g. (assuming the tokenizer drops '&'):
        text = u'i want to order 1 pc hot & crispy'
        matched_tokens = [u'1', u'pc', u'hot', u'crispy']
        Out: 1 pc hot & crispy
    """

    def _get_tokens_and_indices(txt):
        """Tokenize ``txt`` and locate each token in it.

        Args:
            txt (str or unicode): text to tokenize

        Returns:
            tuple:
                list: tokens, direct results from TOKENIZER.tokenize
                list: (start, end) positions, one per token, into ``txt``
        """
        # A sentinel token is appended here and dropped again before returning.
        remaining = txt.rstrip() + ' __eos__'
        tokens = TOKENIZER.tokenize(remaining)
        spans = []
        consumed = 0
        for token in tokens:
            # Positions are relative to the not-yet-consumed remainder.
            start = remaining.index(token)
            end = start + len(token)

            # Handles tricky cases like '(A B) C': when something other than
            # whitespace precedes this token in the remainder, the previous
            # token's end boundary is extended by the length of that chunk.
            leading = remaining[:end]
            leading_tokens = whitespace_tokenizer.tokenize(leading)
            if leading and len(leading_tokens) > 1 and leading_tokens[0]:
                if spans:
                    prev_start, prev_end = spans.pop()
                    prev_end += len(leading_tokens[0])
                    spans.append((prev_start, prev_end))

            remaining = remaining[end:]
            spans.append((consumed + start, consumed + end))
            consumed += end

        # remove eos parts
        tokens.pop()
        spans.pop()
        return tokens, spans

    try:
        window = len(matched_tokens)
        tokens, indices = _get_tokens_and_indices(text)
        for begin in range(len(tokens) - window + 1):
            if tokens[begin:begin + window] == matched_tokens:
                first_span = indices[begin]
                last_span = indices[begin + window - 1]
                return text[first_span[0]:last_span[1]]
    except (ValueError, IndexError):
        ner_logger.exception('Error getting original text (%s, %s)' % (matched_tokens, text))

    return u' '.join(matched_tokens)
def number_range(request):
    """Use NumberRangeDetector to detect number ranges

    Args:
        request: url parameters:
            request params:
                message (str): natural text on which detection logic is to be run. Note if structured
                    value is present detection is run on structured value instead of message
                entity_name (str): name of the entity. Also acts as elastic-search dictionary name
                    if entity uses elastic-search lookup
                structured_value (str): Value obtained from any structured elements. Note if structured
                    value is present detection is run on structured value instead of message
                    (For example, UI elements like form, payload, etc)
                fallback_value (str): If the detection logic fails to detect any value either from
                    structured_value or message then we return a fallback_value as an output.
                bot_message (str): previous message from a bot/agent.
                unit_type (str): restrict number range to detect for some unit types
                    like 'currency', 'temperature'

    Returns:
        HttpResponse: JSON response of the form {'data': entity_output} where entity_output is the
            result of NumberRangeDetector.detect; a response with status 500 is returned when a
            TypeError is raised during detection

    Examples:
        message = "we expect 200-300 people in room"
        entity_name = 'people_range'
        structured_value = None
        fallback_value = None
        bot_message = None
        unit_type = None
        output = number_range(request)
        print output
            >> [{'detection': 'message', 'original_text': '200-300',
                 'entity_value': {'min_value': '200', 'max_value': '300', 'unit': None}}]
    """
    try:
        parameters_dict = get_parameters_dictionary(request)
        ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME])

        number_range_detector = NumberRangeDetector(
            entity_name=parameters_dict[PARAMETER_ENTITY_NAME],
            language=parameters_dict[PARAMETER_SOURCE_LANGUAGE],
            unit_type=parameters_dict[PARAMETER_NUMBER_UNIT_TYPE])

        entity_output = number_range_detector.detect(
            message=parameters_dict[PARAMETER_MESSAGE],
            structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE],
            fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE],
            bot_message=parameters_dict[PARAMETER_BOT_MESSAGE])

        ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output))
    except TypeError as e:
        # Name this view in the log so failures are not mistaken for the ones of the `number` view
        ner_logger.exception('Exception for number_range: %s ' % e)
        return HttpResponse(status=500)

    return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
def date(request):
    """Detect dates in the request parameters with DateAdvancedDetector. Called through the api

    Args:
        request (django.http.request.HttpRequest): HttpRequest object
            request params:
                message (str): natural text on which detection logic is to be run. Note if structured
                    value is present detection is run on structured value instead of message
                entity_name (str): name of the entity. Also acts as elastic-search dictionary name
                    if entity uses elastic-search lookup
                structured_value (str): Value obtained from any structured elements. Note if structured
                    value is present detection is run on structured value instead of message
                    (For example, UI elements like form, payload, etc)
                fallback_value (str): If the detection logic fails to detect any value either from
                    structured_value or message then we return a fallback_value as an output.
                bot_message (str): previous message from a bot/agent.
                timezone (str): timezone of the user, 'UTC' is used when it is empty
                source_language (str): source language code (ISO 639-1)
                language_script (str): language code of script (ISO 639-1)

    Returns:
        response (django.http.response.HttpResponse): HttpResponse object, JSON of the form
            {'data': entity_output}, or a response with status 500 when a TypeError was raised

    Example:
        message = "agle mahine k 5 tarikh ko mera birthday hai"
        entity_name = 'date'
        structured_value = None
        fallback_value = None
        bot_message = None
        timezone = 'UTC'
        source_language = 'hi'
        language_script = 'en'
        output = date(request)
        print output
            >> [{'detection': 'message', 'original_text': 'agle mahine k 5 tarikh',
                 'entity_value': {'value': {'mm': 12, 'yy': 2018, 'dd': 5, 'type': 'date'}}}]
    """
    try:
        params = get_parameters_dictionary(request)
        timezone = params[PARAMETER_TIMEZONE] or 'UTC'
        entity_name = params[PARAMETER_ENTITY_NAME]
        ner_logger.debug('Start: %s ' % entity_name)

        # Only the literal strings 'true' and 'True' switch on past date referencing
        past_date_flag = params.get(PARAMETER_PAST_DATE_REFERENCED, "false")
        past_date_referenced = past_date_flag in ('true', 'True')

        detector = DateAdvancedDetector(entity_name=entity_name,
                                        language=params[PARAMETER_SOURCE_LANGUAGE],
                                        timezone=timezone,
                                        past_date_referenced=past_date_referenced)
        detector.set_bot_message(bot_message=params[PARAMETER_BOT_MESSAGE])

        entity_output = detector.detect(message=params[PARAMETER_MESSAGE],
                                        structured_value=params[PARAMETER_STRUCTURED_VALUE],
                                        fallback_value=params[PARAMETER_FALLBACK_VALUE])
        ner_logger.debug('Finished %s : %s ' % (entity_name, entity_output))
    except TypeError as err:
        ner_logger.exception('Exception for date: %s ' % err)
        return HttpResponse(status=500)

    response_body = json.dumps({'data': entity_output})
    return HttpResponse(response_body, content_type='application/json')
def phone_number(request):
    """Detect phone numbers in the request parameters with PhoneDetector

    request params:
        message (str): natural text on which detection logic is to be run. Note if structured value is
            present detection is run on structured value instead of message
        entity_name (str): name of the entity. Also acts as elastic-search dictionary name
            if entity uses elastic-search lookup
        structured_value (str): Value obtained from any structured elements. Note if structured value is
            present detection is run on structured value instead of message
            (For example, UI elements like form, payload, etc)
        fallback_value (str): If the detection logic fails to detect any value either from
            structured_value or message then we return a fallback_value as an output.
        bot_message (str): previous message from a bot/agent.
        source_language (str): language for which the phone numbers have to be detected

    Returns:
        response (django.http.response.HttpResponse): HttpResponse object, JSON of the form
            {'data': entity_output}, or a response with status 500 when a TypeError was raised

    Examples:
        message = "Call 02226129857 and message +1(408) 92-124 and send 100rs to 91 9820334416 9920441344"
        entity_name = 'phone_number'
        structured_value = None
        fallback_value = None
        bot_message = None
        source_language = 'en'

        entity_output:
            [
                {"detection": "message", "original_text": "91 9820334416",
                 "entity_value": {"value": "919820334416"}, "language": "en"},
                {"detection": "message", "original_text": "9920441344",
                 "entity_value": {"value": "9920441344"}, "language": "en"},
                {"detection": "message", "original_text": "02226129857",
                 "entity_value": {"value": "02226129857"}, "language": "en"},
                {"detection": "message", "original_text": "+1(408) 92-124",
                 "entity_value": {"value": "140892124"}, "language": "en"}
            ]
    """
    try:
        params = get_parameters_dictionary(request)
        entity_name = params[PARAMETER_ENTITY_NAME]
        ner_logger.debug('Start: %s ' % entity_name)
        language = params[PARAMETER_SOURCE_LANGUAGE]

        ner_logger.debug('Entity Name %s' % entity_name)
        ner_logger.debug('Source Language %s' % language)

        detector = PhoneDetector(entity_name=entity_name, language=language)
        entity_output = detector.detect(message=params[PARAMETER_MESSAGE],
                                        structured_value=params[PARAMETER_STRUCTURED_VALUE],
                                        fallback_value=params[PARAMETER_FALLBACK_VALUE],
                                        bot_message=params[PARAMETER_BOT_MESSAGE])
        ner_logger.debug('Finished %s : %s ' % (entity_name, entity_output))
    except TypeError as err:
        ner_logger.exception('Exception for phone_number: %s ' % err)
        return HttpResponse(status=500)

    response_body = json.dumps({'data': entity_output})
    return HttpResponse(response_body, content_type='application/json')
def number(request):
    """Use NumberDetector to detect numerals

    Args:
        request: url parameters:
            request params:
                message (str): natural text on which detection logic is to be run. Note if structured
                    value is present detection is run on structured value instead of message
                entity_name (str): name of the entity. Also acts as elastic-search dictionary name
                    if entity uses elastic-search lookup
                structured_value (str): Value obtained from any structured elements. Note if structured
                    value is present detection is run on structured value instead of message
                    (For example, UI elements like form, payload, etc)
                fallback_value (str): If the detection logic fails to detect any value either from
                    structured_value or message then we return a fallback_value as an output.
                bot_message (str): previous message from a bot/agent.
                unit_type (str): restrict number range to detect for some unit types
                    like 'currency', 'temperature'
                min_digit (str): min digit
                max_digit (str): max digit
                    The digit limits are applied only when both min_digit and max_digit are given

    Returns:
        HttpResponse: JSON response of the form {'data': entity_output} where entity_output is the
            result of NumberDetector.detect; a response with status 500 is returned when a TypeError
            or ValueError is raised (e.g. min_digit / max_digit that cannot be converted to int)

    Example:
        message = "I want to purchase 30 units of mobile and 40 units of Television"
        entity_name = 'number_of_unit'
        structured_value = None
        fallback_value = None
        bot_message = None
        unit_type = None
        min_digit = 1
        max_digit = 2
        output = number(request)
        print output
            >> [{'detection': 'message', 'original_text': '30', 'entity_value': {'value': '30', 'unit': None}},
                {'detection': 'message', 'original_text': '40', 'entity_value': {'value': '40', 'unit': None}}]

        message = "I want to reserve a table for 3 people"
        entity_name = 'number_of_people'
        structured_value = None
        fallback_value = None
        bot_message = None
        unit_type = None
        min_digit = 1
        max_digit = 6
        output = number(request)
        print output
            >> [{'detection': 'message', 'original_text': 'for 3 people',
                 'entity_value': {'value': '3', 'unit': 'people'}}]
    """
    try:
        parameters_dict = get_parameters_dictionary(request)
        ner_logger.debug('Start: %s ' % parameters_dict[PARAMETER_ENTITY_NAME])

        number_detection = NumberDetector(entity_name=parameters_dict[PARAMETER_ENTITY_NAME],
                                          language=parameters_dict[PARAMETER_SOURCE_LANGUAGE],
                                          unit_type=parameters_dict[PARAMETER_NUMBER_UNIT_TYPE])

        if parameters_dict[PARAMETER_MIN_DIGITS] and parameters_dict[PARAMETER_MAX_DIGITS]:
            # int() raises ValueError for non numeric input, handled below like a TypeError
            min_digit = int(parameters_dict[PARAMETER_MIN_DIGITS])
            max_digit = int(parameters_dict[PARAMETER_MAX_DIGITS])
            number_detection.set_min_max_digits(min_digit=min_digit, max_digit=max_digit)

        entity_output = number_detection.detect(message=parameters_dict[PARAMETER_MESSAGE],
                                                structured_value=parameters_dict[PARAMETER_STRUCTURED_VALUE],
                                                fallback_value=parameters_dict[PARAMETER_FALLBACK_VALUE],
                                                bot_message=parameters_dict[PARAMETER_BOT_MESSAGE])
        ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output))
    except (TypeError, ValueError) as e:
        ner_logger.exception('Exception for numeric: %s ' % e)
        return HttpResponse(status=500)

    return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
def time(request):
    """Detect time in the request parameters with TimeDetector. Called through the api

    Args:
        request (django.http.request.HttpRequest): HttpRequest object
            request params:
                message (str): natural text on which detection logic is to be run. Note if structured
                    value is present detection is run on structured value instead of message
                entity_name (str): name of the entity. Also acts as elastic-search dictionary name
                    if entity uses elastic-search lookup
                structured_value (str): Value obtained from any structured elements. Note if structured
                    value is present detection is run on structured value instead of message
                    (For example, UI elements like form, payload, etc)
                fallback_value (str): If the detection logic fails to detect any value either from
                    structured_value or message then we return a fallback_value as an output.
                bot_message (str): previous message from a bot/agent.
                timezone (str): timezone of the user, 'UTC' is used when it is empty
                source_language (str): source language code (ISO 639-1)
                language_script (str): language code of script (ISO 639-1)

    Returns:
        response (django.http.response.HttpResponse): HttpResponse object, JSON of the form
            {'data': entity_output}, or a response with status 500 when a TypeError was raised

    Example:
        message = "kal subah 5 baje mujhe jaga dena"
        entity_name = 'time'
        structured_value = None
        fallback_value = None
        bot_message = None
        timezone = 'UTC'
        source_language = 'hi'
        language_script = 'en'
        output = time(request)
        print output
            >> list of dicts with the keys 'detection', 'original_text' and 'entity_value',
               e.g. 'entity_value': {'mm': 30, 'hh': 12, 'nn': 'pm'}
    """
    try:
        params = get_parameters_dictionary(request)
        timezone = params[PARAMETER_TIMEZONE] or 'UTC'
        # The detector is told whether a structured value was supplied
        form_check = bool(params[PARAMETER_STRUCTURED_VALUE])
        entity_name = params[PARAMETER_ENTITY_NAME]
        ner_logger.debug('Start: %s ' % entity_name)

        detector = TimeDetector(entity_name=entity_name,
                                language=params[PARAMETER_SOURCE_LANGUAGE],
                                timezone=timezone,
                                form_check=form_check)
        detector.set_bot_message(bot_message=params[PARAMETER_BOT_MESSAGE])

        entity_output = detector.detect(message=params[PARAMETER_MESSAGE],
                                        structured_value=params[PARAMETER_STRUCTURED_VALUE],
                                        fallback_value=params[PARAMETER_FALLBACK_VALUE])
        ner_logger.debug('Finished %s : %s ' % (entity_name, entity_output))
    except TypeError as err:
        ner_logger.exception('Exception for time: %s ' % err)
        return HttpResponse(status=500)

    response_body = json.dumps({'data': entity_output})
    return HttpResponse(response_body, content_type='application/json')
def text(request):
    """
    Run text detection (get_text) on the 'message or list of messages' passed in the request

    Args:
        request (django.http.HttpRequest): HTTP request carrying the detection parameters

    Returns:
        response (django.http.HttpResponse): HttpResponse object containing "entity_output"

        where "entity_output" is :
            list of dict: containing dict of detected entities with their original texts for a message
                OR
            list of lists: containing dict of detected entities with their original texts for each
                message in the list

        A response with status 500 is returned when the datastore request fails, the connection to
        elasticsearch fails or times out, or a TypeError / KeyError is raised

    EXAMPLES:
        --- Single message
            >>> message = u'i want to order chinese from mainland china and pizza from domminos'
            >>> entity_name = 'restaurant'
            >>> structured_value = None
            >>> fallback_value = None
            >>> bot_message = None
            >>> entity_output = get_text(message=message,
            >>>                          entity_name=entity_name,
            >>>                          structured_value=structured_value,
            >>>                          fallback_value=fallback_value,
            >>>                          bot_message=bot_message)
            >>> print(entity_output)
            [
                {'detection': 'message', 'original_text': 'mainland china',
                 'entity_value': {'value': u'Mainland China'}},
                {'detection': 'message', 'original_text': 'domminos',
                 'entity_value': {'value': u"Domino's Pizza"}}
            ]

            >>> message = u'i wanted to watch movie'
            >>> entity_name = 'movie'
            >>> structured_value = u'inferno'
            >>> fallback_value = None
            >>> bot_message = None
            >>> entity_output = get_text(message=message,
            >>>                          entity_name=entity_name,
            >>>                          structured_value=structured_value,
            >>>                          fallback_value=fallback_value,
            >>>                          bot_message=bot_message)
            >>> print(entity_output)
            [
                {'detection': 'structure_value_verified', 'original_text': 'inferno',
                 'entity_value': {'value': u'Inferno'}}
            ]

            >>> message = u'i wanted to watch inferno'
            >>> entity_name = 'movie'
            >>> structured_value = u'delhi'
            >>> fallback_value = None
            >>> bot_message = None
            >>> entity_output = get_text(message=message,
            >>>                          entity_name=entity_name,
            >>>                          structured_value=structured_value,
            >>>                          fallback_value=fallback_value,
            >>>                          bot_message=bot_message)
            >>> print(entity_output)
            [
                {'detection': 'message', 'original_text': 'inferno',
                 'entity_value': {'value': u'Inferno'}}
            ]

        --- Bulk detection
            >>> message = [u'book a flight to mumbai',
            >>>            u'i want to go to delhi from mumbai']
            >>> entity_name = u'city'
            >>> entity_output = get_text(message=message,
            >>>                          entity_name=entity_name,
            >>>                          structured_value=structured_value,
            >>>                          fallback_value=fallback_value,
            >>>                          bot_message=bot_message)
            >>> print(entity_output)
            [
                [
                    {'detection': 'message', 'entity_value': {'value': u'mumbai'},
                     'original_text': u'mumbai'}
                ],
                [
                    {'detection': 'message', 'entity_value': {'value': u'New Delhi'},
                     'original_text': u'delhi'},
                    {'detection': 'message', 'entity_value': {'value': u'mumbai'},
                     'original_text': u'mumbai'}
                ]
            ]
    """
    try:
        params = parse_parameters_from_request(request)
        # Keys are read in the same order as the keyword arguments they feed
        detection_kwargs = {
            'message': params[PARAMETER_MESSAGE],
            'entity_name': params[PARAMETER_ENTITY_NAME],
            'structured_value': params[PARAMETER_STRUCTURED_VALUE],
            'fallback_value': params[PARAMETER_FALLBACK_VALUE],
            'bot_message': params[PARAMETER_BOT_MESSAGE],
            'language': params[PARAMETER_SOURCE_LANGUAGE],
            'fuzziness': params[PARAMETER_FUZZINESS],
            'min_token_len_fuzziness': params[PARAMETER_MIN_TOKEN_LEN_FUZZINESS],
            'predetected_values': params[PARAMETER_PRIOR_RESULTS],
        }
        entity_output = get_text(**detection_kwargs)
        ner_logger.debug('Finished %s : %s ' % (params[PARAMETER_ENTITY_NAME], entity_output))
    except DataStoreRequestException as err:
        ner_logger.exception(f"Error in requesting ES {request.path}, error: {err}, query: {err.request},"
                             f" response: {err.response}")
        return HttpResponse(status=500)
    except (es_exceptions.ConnectionTimeout, es_exceptions.ConnectionError, TypeError, KeyError) as err:
        # All of these are logged and answered in exactly the same way
        ner_logger.exception(f"Error in text_synonym for: {request.path}, error: {err}")
        return HttpResponse(status=500)

    response_body = json.dumps({'data': entity_output})
    return HttpResponse(response_body, content_type='application/json')
def verify_text_request(request):
    """
    Check the request object
        1. If proper messages and entities are present in required format.
        2. If number of messages and entities is in allowed range

    Args:
        request: API request object, its body is parsed as JSON

    Returns:
        None

    Raises:
        KeyError: if "messages" or "entities" is missing or empty
        TypeError: if "messages" is not a list or "entities" is not a dict
        ValueError: if there are more than MAX_NUMBER_BULK_MESSAGE messages or more than
            MAX_NUMBER_MULTI_ENTITIES entities. json.loads also raises a ValueError subclass
            (json.JSONDecodeError) when the body is not valid JSON
    """
    request_data = json.loads(request.body)
    messages = request_data.get("messages")
    entities = request_data.get("entities")

    # ner_logger.error is used below instead of ner_logger.exception: no exception is being handled
    # at these points, so there is no traceback that could be attached to the log record

    if not messages:
        ner_logger.error("messages param is not passed")
        raise KeyError("key messages is required")

    if not entities:
        ner_logger.error("Entities param is not passed")
        raise KeyError("Entities dict is required")

    if not isinstance(messages, list):
        ner_logger.error("messages param is not in correct format")
        raise TypeError("messages should be in format of list of string")

    if not isinstance(entities, dict):
        ner_logger.error("Entities param is not in correct format")
        raise TypeError("Entities should be dict of entity details")

    if len(messages) > MAX_NUMBER_BULK_MESSAGE:
        error_message = (f"Maximum number of message can be {MAX_NUMBER_BULK_MESSAGE} for "
                         "bulk detection")
        ner_logger.error(error_message)
        raise ValueError(error_message)

    if len(entities) > MAX_NUMBER_MULTI_ENTITIES:
        # The same text is logged and raised so that the log entry matches the error
        error_message = (f"Maximum number of entities can be {MAX_NUMBER_MULTI_ENTITIES} for "
                         "bulk detection")
        ner_logger.error(error_message)
        raise ValueError(error_message)
def _get_substring_from_processed_text(self, matched_tokens): """ Get part of original text that was detected as some entity value. This method was written to tackle cases when original text contains special characters which are dropped during tokenization Args: matched_tokens (list): list of tokens (usually tokens from fuzzy match results from ES) to find as a contiguous substring in the processed text considering the effects of tokenizer Returns: str or unicode: part of original text that corresponds to given tokens E.g. self.processed_text = u'i want to order 1 pc hot & crispy' tokens = [u'i', u'want', u'to', u'order', u'1', u'pc', u'hot', u'crispy'] indices = [(1, 2), (3, 7), (8, 10), (11, 16), (17, 18), (19, 21), (22, 25), (28, 34)]) In: matched_tokens = [u'1', u'pc', u'hot', u'crispy'] Out: 1 pc hot & crispy Notice that & is dropped during tokenization but when finding original text, we recover it from processed text """ def _get_tokens_and_indices(text): """ Args: text (str or unicode): text to get tokens from and indicies of those tokens in the given text Returns: tuple: list: containing tokens, direct results from tokenizer.tokenize list: containing (int, int) indicating start and end position of ith token (of first list) in given text E.g. 
In: text = u'i want to order 1 pc hot & crispy' Out: ([u'i', u'want', u'to', u'order', u'1', u'pc', u'hot', u'crispy'], [(1, 2), (3, 7), (8, 10), (11, 16), (17, 18), (19, 21), (22, 25), (28, 34)]) """ processed_text_tokens = TOKENIZER.tokenize(text) processed_text_tokens_indices = [] offset = 0 txt = text for token in processed_text_tokens: st = txt.index(token) en = st + len(token) txt = txt[en:] processed_text_tokens_indices.append((offset + st, offset + en)) offset += en return processed_text_tokens, processed_text_tokens_indices try: n = len(matched_tokens) tokens, indices = _get_tokens_and_indices(self.processed_text) for i in range(len(tokens) - n + 1): if tokens[i:i + n] == matched_tokens: start = indices[i][0] end = indices[i + n - 1][1] return self.processed_text[start:end] except (ValueError, IndexError): ner_logger.exception('Error getting original text (%s, %s)' % (matched_tokens, self.processed_text)) return u' '.join(matched_tokens)