Exemplo n.º 1
0
def city(request):
    """Detect city entities for an API request by delegating to get_city().

    Args:
        request: incoming HTTP request; its parameters are extracted with
            parse_parameters_from_request().

    Returns:
        HttpResponse: JSON body of the form ``{'data': entity_output}`` on success, or an empty
        response with status 500 when one of the handled exceptions is raised.
    """
    try:
        parameters_dict = parse_parameters_from_request(request)
        entity_output = get_city(parameters_dict[PARAMETER_MESSAGE], parameters_dict[PARAMETER_ENTITY_NAME],
                                 parameters_dict[PARAMETER_STRUCTURED_VALUE],
                                 parameters_dict[PARAMETER_FALLBACK_VALUE],
                                 parameters_dict[PARAMETER_BOT_MESSAGE],
                                 parameters_dict[PARAMETER_SOURCE_LANGUAGE]
                                 )
        ner_logger.debug('Finished %s : %s ' % (parameters_dict[PARAMETER_ENTITY_NAME], entity_output))
    except (TypeError, KeyError, es_exceptions.ConnectionTimeout, es_exceptions.ConnectionError) as e:
        # All four failure modes are handled identically; the message names this view
        # (previously three of the handlers logged "text_synonym", which was misleading).
        ner_logger.exception('Exception for city: %s ' % e)
        return HttpResponse(status=500)

    return HttpResponse(json.dumps({'data': entity_output}), content_type='application/json')
Exemplo n.º 2
0
def regex(request):
    """API view that runs get_regex() on the parameters of the request.

    Args:
        request: incoming HTTP request; its parameters are extracted with
            parse_parameters_from_request().

    Returns:
        HttpResponse: JSON body of the form ``{'data': entity_output}`` on success, or an empty
        response with status 500 when a TypeError is raised during detection.
    """
    try:
        params = parse_parameters_from_request(request)
        # Positional argument order expected by get_regex().
        argument_keys = (
            PARAMETER_MESSAGE,
            PARAMETER_ENTITY_NAME,
            PARAMETER_STRUCTURED_VALUE,
            PARAMETER_FALLBACK_VALUE,
            PARAMETER_BOT_MESSAGE,
            PARAMETER_REGEX,
            PARAMETER_ASR,
            PARAMETER_SOURCE_LANGUAGE,
        )
        entity_output = get_regex(*[params[key] for key in argument_keys])
        ner_logger.debug('Finished %s : %s ' % (params[PARAMETER_ENTITY_NAME], entity_output))
    except TypeError as error:
        ner_logger.exception('Exception for regex: %s ' % error)
        return HttpResponse(status=500)

    response_body = json.dumps({'data': entity_output})
    return HttpResponse(response_body, content_type='application/json')
Exemplo n.º 3
0
    def get_crf_data_for_entity_name(self, entity_name, **kwargs):
        """
        Fetch the CRF training sentences and their entities for the given entity name.

        Connects lazily if no client/connection exists yet. Data is only fetched when the
        configured engine is Elasticsearch; for any other engine an empty dict is returned.

        Args:
            entity_name (str): Entity name for which training data needs to be obtained
            kwargs:
                For Elasticsearch:
                    Refer https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search
        Returns:
            dict: Training data for the given entity as returned by
            elastic_search.query.get_crf_data_for_entity_name, or {} for non-Elasticsearch engines.

        Raises:
             IndexNotFoundException: if ELASTICSEARCH_CRF_DATA_INDEX_NAME is missing from the
             connection settings

        Example:
            db = Datastore()
            db.get_crf_data_for_entity_name(entity_name, **kwargs)
            >> {
                'sentence_list': [
                    'My name is hardik',
                    'This is my friend Ajay'
                ],
                'entity_list': [
                    ['hardik'],
                    ['Ajay']
                ]
            }
        """
        ner_logger.debug('Datastore, get_entity_training_data, entity_name %s' % entity_name)
        if self._client_or_connection is None:
            self._connect()

        # Only Elasticsearch is handled here; everything else yields an empty result.
        if not self._engine == ELASTICSEARCH:
            return {}

        crf_index_name = self._connection_settings.get(ELASTICSEARCH_CRF_DATA_INDEX_NAME)
        if crf_index_name is None:
            raise IndexNotFoundException('Index for ELASTICSEARCH_CRF_DATA_INDEX_NAME not found. '
                                         'Please configure the same')

        self._check_doc_type_for_crf_data_elasticsearch()
        timeout = self._connection_settings.get('request_timeout', 20)
        crf_data = elastic_search.query.get_crf_data_for_entity_name(
            connection=self._client_or_connection,
            index_name=crf_index_name,
            doc_type=self._connection_settings[ELASTICSEARCH_CRF_DATA_DOC_TYPE],
            entity_name=entity_name,
            request_timeout=timeout,
            **kwargs)
        # NOTE(review): the message is labelled "results_dictionary" but logs the entity name.
        ner_logger.debug('Datastore, get_entity_training_data, results_dictionary %s' % str(entity_name))
        return crf_data
Exemplo n.º 4
0
    def detect_regex(self):
        """
        Detect the first substring of ``self.text`` that matches ``self.regex``.

        Any exception raised while compiling or applying the regex (for example an invalid
        pattern) is logged at debug level and swallowed; empty lists are returned in that case.

        Returns:
            (regex_list, original_list) (tuple):
                regex_list (list): the first match found in the text, or empty if there is none
                original_list (list): same content as regex_list
        Example:
            self.regex = r'\\d+'
            self.text = 'aman123'

            detect_regex()
            >> (['123'], ['123'])
        """
        original_list = []
        regex_list = []
        try:
            compiled_regex = re.compile(self.regex)
            matches = compiled_regex.findall(self.text)
            # "No match" is an expected outcome, so it is handled explicitly instead of
            # relying on an IndexError being caught below.
            if matches:
                regex_list.append(matches[0])
                original_list.extend(regex_list)
                self.update_processed_text(regex_list)
        except Exception as e:
            # Format the exception itself: `e.message` does not exist on Python 3 exceptions,
            # so using it here would raise AttributeError from inside the handler.
            ner_logger.debug("Exception detect regex: %s" % e)
        return regex_list, original_list
Exemplo n.º 5
0
def get_parameters_dictionary(request):
    # type: (django.http.HttpRequest) -> Dict[str, Any]
    """
    Build the detection parameters dictionary from the GET query of an HTTP request.

    Missing parameters map to None, except: 'is_asr' defaults to False, 'language_script' and
    'source_language' default to ENGLISH_LANG, and 'predetected_values' defaults to an empty
    list (it is parsed from JSON).

    Args:
        request (django.http.HttpRequest): HTTP request whose GET parameters are read

    Returns:
       dict: GET parameters from the request, keyed by the PARAMETER_* constants
    """
    fetch = request.GET.get
    extracted = {
        PARAMETER_MESSAGE: fetch('message'),
        PARAMETER_ENTITY_NAME: fetch('entity_name'),
        PARAMETER_STRUCTURED_VALUE: fetch('structured_value'),
        PARAMETER_FALLBACK_VALUE: fetch('fallback_value'),
        PARAMETER_BOT_MESSAGE: fetch('bot_message'),
        PARAMETER_TIMEZONE: fetch('timezone'),
        PARAMETER_REGEX: fetch('regex'),
        PARAMETER_ASR: fetch('is_asr', False),
        PARAMETER_LANGUAGE_SCRIPT: fetch('language_script', ENGLISH_LANG),
        PARAMETER_SOURCE_LANGUAGE: fetch('source_language', ENGLISH_LANG),
        PARAMETER_FUZZINESS: fetch('fuzziness'),
        PARAMETER_MIN_TOKEN_LEN_FUZZINESS: fetch('min_token_len_fuzziness'),
        PARAMETER_MIN_DIGITS: fetch('min_number_digits'),
        PARAMETER_MAX_DIGITS: fetch('max_number_digits'),
        # Prior results arrive as a JSON-encoded string in the query.
        PARAMETER_PRIOR_RESULTS: json.loads(fetch("predetected_values", '[]')),
    }
    ner_logger.debug("parameters dict - {}".format(extracted))
    return extracted
Exemplo n.º 6
0
    def train_crf_model_from_list(self,
                                  sentence_list,
                                  entity_list,
                                  c1=0,
                                  c2=0,
                                  max_iterations=1000):
        """
        Pre-process the given sentences/entities and train the CRF model on the result.

        Args:
            sentence_list (list): List of sentences on which the NER task has to be carried out.
            entity_list (list): List of entities present in each sentence of the sentence_list.
            c1 (int): Coefficient of regularization to control variance and bias.
            c2 (int): Coefficient of regularization to control variance and bias.
            max_iterations (int): Max number of iterations to be carried out.
        Returns:
            The value returned by self.train_crf_model() (presumably the path of the trained
            model - verify against train_crf_model).
        """
        ner_logger.debug('Pre processing for Entity: %s started' % self.entity_name)
        features, labels = CrfPreprocessData.preprocess_crf_text_entity_list(
            sentence_list=sentence_list,
            entity_list=entity_list,
            read_embeddings_from_remote_url=self.read_embeddings_from_remote_url)
        ner_logger.debug('Pre processing for Entity: %s completed' % self.entity_name)
        return self.train_crf_model(features, labels, c1, c2, max_iterations)
Exemplo n.º 7
0
def budget(request):
    """API view that runs get_budget() on the GET parameters of the request.

    Args:
        request: incoming HTTP request; parameters are read via get_parameters_dictionary().

    Returns:
        HttpResponse: JSON body of the form ``{'data': entity_output}`` on success, or an empty
        response with status 500 when a TypeError is raised during detection.
    """
    try:
        params = get_parameters_dictionary(request)
        entity_name = params[PARAMETER_ENTITY_NAME]
        ner_logger.debug('Start: %s ' % entity_name)
        # Positional argument order expected by get_budget().
        argument_keys = (
            PARAMETER_MESSAGE,
            PARAMETER_ENTITY_NAME,
            PARAMETER_STRUCTURED_VALUE,
            PARAMETER_FALLBACK_VALUE,
            PARAMETER_BOT_MESSAGE,
            PARAMETER_MIN_DIGITS,
            PARAMETER_MAX_DIGITS,
        )
        entity_output = get_budget(*[params[key] for key in argument_keys])
        ner_logger.debug('Finished %s : %s ' % (entity_name, entity_output))
    except TypeError as error:
        ner_logger.exception('Exception for budget: %s ' % error)
        return HttpResponse(status=500)

    response_body = json.dumps({'data': entity_output})
    return HttpResponse(response_body, content_type='application/json')
    def test_number_range_detection_from_csv(self):
        """
        Run the number range detector on every row of the CSV at ``self.csv_path`` and check
        that each detected (entity_value, original_text) pair is one of the expected pairs
        built from that row.
        """
        df = pd.read_csv(self.csv_path,
                         encoding='utf-8',
                         keep_default_na=False)

        # Group by the scalar column label: grouping by a one-element list yields 1-tuple keys
        # on recent pandas versions, which would pass a tuple as `language` to the detector.
        for language, language_tests_df in df.groupby(by='language'):
            ner_logger.debug('Running tests for language {}'.format(language))
            for _, row in language_tests_df.iterrows():
                message = row['message']
                # The CSV uses the literal string 'NA' to mean "no unit type".
                unit_type = None if row['unit_type'] == 'NA' else row['unit_type']
                number_range_detector = NumberRangeDetector(
                    entity_name='number_range',
                    language=language,
                    unit_type=unit_type)
                expected_entity_values_list, expected_original_texts_list = \
                    self._make_expected_output(row['min_value'], row['max_value'], row['unit'], row['original_text'])

                expected_zipped = list(
                    zip(expected_entity_values_list,
                        expected_original_texts_list))

                detected_entities_values_list, detected_original_texts_list = \
                    number_range_detector.detect_entity(message)

                for detected_number_range in zip(detected_entities_values_list,
                                                 detected_original_texts_list):
                    self.assertIn(detected_number_range, expected_zipped)
Exemplo n.º 9
0
def get_timezone(timezone, ignore_errors=True):
    # type: (Union[datetime.tzinfo, str, unicode], bool) -> datetime.tzinfo
    """
    Resolve `timezone` to a pytz timezone object.

    A non-string tzinfo object exposing `localize` is returned unchanged. Anything else is
    passed to `pytz.timezone`; if that fails, either the UTC timezone is returned
    (`ignore_errors` True) or the original exception is re-raised (`ignore_errors` False).

    Args:
        timezone (str or datetime.tzinfo): Either a valid timezone string or datetime.tzinfo object
        ignore_errors (bool, optional): when True, fall back to UTC on any error while building the
            timezone; when False, let the exception propagate. Defaults to True.

    Returns:
        datetime.tzinfo: A pytz timezone object

    """
    is_ready_made_timezone = (isinstance(timezone, tzinfo)
                              and not isinstance(timezone, six.string_types)
                              and hasattr(timezone, 'localize'))
    if is_ready_made_timezone:
        return timezone

    try:
        return pytz.timezone(timezone)
    except Exception as error:
        if not ignore_errors:
            raise
        ner_logger.debug('Timezone error: %s ' % error)
        utc_timezone = pytz.timezone('UTC')
        ner_logger.debug('Using "UTC" as default timezone')
        return utc_timezone
Exemplo n.º 10
0
def person_name(request):
    """API view that runs get_person_name() on the GET parameters of the request.

    Args:
        request: incoming HTTP request; parameters are read via get_parameters_dictionary().

    Returns:
        HttpResponse: JSON body of the form ``{'data': entity_output}`` on success, or an empty
        response with status 500 when a TypeError is raised during detection.
    """
    try:
        params = get_parameters_dictionary(request)
        entity_name = params[PARAMETER_ENTITY_NAME]
        ner_logger.debug('Start: %s ' % entity_name)
        detection_kwargs = {
            'message': params[PARAMETER_MESSAGE],
            'entity_name': params[PARAMETER_ENTITY_NAME],
            'structured_value': params[PARAMETER_STRUCTURED_VALUE],
            'fallback_value': params[PARAMETER_FALLBACK_VALUE],
            'bot_message': params[PARAMETER_BOT_MESSAGE],
            'language': params[PARAMETER_SOURCE_LANGUAGE],
        }
        entity_output = get_person_name(**detection_kwargs)
        ner_logger.debug('Finished %s : %s ' % (entity_name, entity_output))
    except TypeError as error:
        ner_logger.exception('Exception for person_name: %s ' % error)
        return HttpResponse(status=500)

    response_body = json.dumps({'data': entity_output})
    return HttpResponse(response_body, content_type='application/json')
Exemplo n.º 11
0
    def initialize_files(self, entity_type):
        """
        Select the CRF model path and tagger for the given entity type.

        City and date entity types are supported. The tagger for each type is created once,
        stored in a module-level global and reused on later calls. For any other entity type
        nothing is changed.

        Args:
            entity_type: type of entity

        """
        global CITY_MODEL_OBJECT, DATE_MODEL_OBJECT
        tagger_options = "-m %s -v 3 -n2"

        if entity_type == CITY_ENTITY_TYPE:
            self._model_path = CITY_MODEL_PATH
            if not CITY_MODEL_OBJECT:
                # First use: load the model and cache the tagger globally.
                CITY_MODEL_OBJECT = CRFPP.Tagger(tagger_options % self._model_path)
                ner_logger.debug('CITY CRF model loaded %s' % self._model_path)
            self.tagger = CITY_MODEL_OBJECT
            return

        if entity_type == DATE_ENTITY_TYPE:
            self._model_path = DATE_MODEL_PATH
            if not DATE_MODEL_OBJECT:
                # First use: load the model and cache the tagger globally.
                DATE_MODEL_OBJECT = CRFPP.Tagger(tagger_options % self._model_path)
                ner_logger.debug('date CRF model loaded %s' % self._model_path)
            self.tagger = DATE_MODEL_OBJECT
Exemplo n.º 12
0
def parse_post_request(request):
    # type: (django.http.HttpRequest) -> Dict[str, Any]
    """
    Build the detection parameters dictionary from the JSON body of an HTTP request.

    Missing fields map to None, except: 'language_script' and 'source_language' default to
    ENGLISH_LANG, 'predetected_values' defaults to an empty list, and the two
    'read_*' flags are passed through to_bool().

    Args:
        request (django.http.HttpRequest): HTTP request whose body is a JSON object

    Returns:
       dict: parameters from the request, keyed by the PARAMETER_* constants
    """
    field = json.loads(request.body).get
    extracted = {
        PARAMETER_MESSAGE: field('message'),
        PARAMETER_ENTITY_NAME: field('entity_name'),
        PARAMETER_STRUCTURED_VALUE: field('structured_value'),
        PARAMETER_FALLBACK_VALUE: field('fallback_value'),
        PARAMETER_BOT_MESSAGE: field('bot_message'),
        PARAMETER_TIMEZONE: field('timezone'),
        PARAMETER_REGEX: field('regex'),
        PARAMETER_LANGUAGE_SCRIPT: field('language_script', ENGLISH_LANG),
        PARAMETER_SOURCE_LANGUAGE: field('source_language', ENGLISH_LANG),
        PARAMETER_FUZZINESS: field('fuzziness'),
        PARAMETER_MIN_TOKEN_LEN_FUZZINESS: field('min_token_len_fuzziness'),
        PARAMETER_MIN_DIGITS: field('min_number_digits'),
        PARAMETER_MAX_DIGITS: field('max_number_digits'),
        PARAMETER_READ_EMBEDDINGS_FROM_REMOTE_URL: to_bool(field('read_embeddings_from_remote_url')),
        PARAMETER_READ_MODEL_FROM_S3: to_bool(field('read_model_from_s3')),
        PARAMETER_LIVE_CRF_MODEL_PATH: field('live_crf_model_path'),
        PARAMETER_PRIOR_RESULTS: field("predetected_values", []),
    }

    ner_logger.debug("parameters dict - {}".format(extracted))

    return extracted
Exemplo n.º 13
0
def passenger_count(request):
    """API view that runs get_passenger_count() on the GET parameters of the request.

    Args:
        request (django.http.request.HttpRequest): HttpRequest object

    Returns:
        response (django.http.response.HttpResponse): JSON body of the form
        ``{'data': entity_output}`` on success, or an empty response with status 500 when a
        TypeError is raised during detection.

    """
    try:
        params = get_parameters_dictionary(request)
        entity_name = params[PARAMETER_ENTITY_NAME]
        ner_logger.debug('Start: %s ' % entity_name)
        # Positional argument order expected by get_passenger_count().
        argument_keys = (
            PARAMETER_MESSAGE,
            PARAMETER_ENTITY_NAME,
            PARAMETER_STRUCTURED_VALUE,
            PARAMETER_FALLBACK_VALUE,
            PARAMETER_BOT_MESSAGE,
        )
        entity_output = get_passenger_count(*[params[key] for key in argument_keys])
        ner_logger.debug('Finished %s : %s ' % (entity_name, entity_output))
    except TypeError as error:
        ner_logger.exception('Exception for passenger count: %s ' % error)
        return HttpResponse(status=500)

    response_body = json.dumps({'data': entity_output})
    return HttpResponse(response_body, content_type='application/json')
Exemplo n.º 14
0
def number_range(request):
    """Use NumberRangeDetector to detect number ranges for an API request.

    Args:
        request: HTTP request; parameters are read via get_parameters_dictionary().

        request params:
            message (str): natural text on which detection logic is to be run.
            entity_name (str): name of the entity. Also acts as elastic-search dictionary name
                               if entity uses elastic-search lookup
            structured_value (str): Value obtained from any structured elements
                                    (For example, UI elements like form, payload, etc).
            fallback_value (str): If the detection logic fails to detect any value either from
                                  structured_value or message then a fallback_value is returned.
            bot_message (str): previous message from a bot/agent.
            unit_type (str): restrict number range to detect for some unit types like
                             'currency', 'temperature'

    Returns:
        HttpResponse: JSON body of the form ``{'data': entity_output}`` on success, or an empty
        response with status 500 when a TypeError is raised during detection.

    Examples:
        message = "we expect 200-300 people in room"
        entity_name = 'people_range'
        structured_value = None
        fallback_value = None
        bot_message = None
        unit_type = None
        output = number_range(request)
        print(output)

        >> [{'detection': 'message', 'original_text': '200-300', 'entity_value': {'min_value': '200',
             'max_value': '300', 'unit': None}}]
    """
    try:
        params = get_parameters_dictionary(request)
        entity_name = params[PARAMETER_ENTITY_NAME]
        ner_logger.debug('Start: %s ' % entity_name)

        detector = NumberRangeDetector(
            entity_name=params[PARAMETER_ENTITY_NAME],
            language=params[PARAMETER_SOURCE_LANGUAGE],
            unit_type=params[PARAMETER_NUMBER_UNIT_TYPE],
        )
        entity_output = detector.detect(
            message=params[PARAMETER_MESSAGE],
            structured_value=params[PARAMETER_STRUCTURED_VALUE],
            fallback_value=params[PARAMETER_FALLBACK_VALUE],
            bot_message=params[PARAMETER_BOT_MESSAGE],
        )

        ner_logger.debug('Finished %s : %s ' % (entity_name, entity_output))

    except TypeError as error:
        ner_logger.exception('Exception for numeric: %s ' % error)
        return HttpResponse(status=500)

    response_body = json.dumps({'data': entity_output})
    return HttpResponse(response_body, content_type='application/json')
    def _detect_time_with_coln_format(self, time_list, original_list):
        """
        Detect times written as <hh>:<mm> in the lower-cased processed text, for example
        1.  कल 5:30 बजे
        2.  आज १०:१५ बजे अजना

        Only matches whose hour and minute parts are at most two digits long are accepted. The
        detections are appended to the given lists (new lists are created when None is passed).

        Args:
            time_list (list): list of dicts consisting of the detected time entity
            original_list (list): list consisting of the origin subtext which is detected as time entity

        Returns:
            time_list (list): list of dicts consisting of the detected time entity
            original_list (list): list consisting of the origin subtext which is detected as time entity

        Example:

            time_list = []
            original_list = []
            self.processed_text = u'आज 05:40 बजे अजना'
            self._detect_time_with_coln_format(time_list, original_list)
            >> ([{'hh': 5, 'mm': 40, 'tz': ..., 'time_type': None, 'nn': ...}], ["05:40"])

        """
        matches = re.findall(r'\s*((\d+)\:(\d+))\s*',
                             self.processed_text.lower(), re.U)
        time_list = [] if time_list is None else time_list
        original_list = [] if original_list is None else original_list

        # Each match is (whole "<hh>:<mm>" text, hour digits, minute digits).
        for original, hour_text, minute_text in matches:
            if len(hour_text) <= 2 and len(minute_text) <= 2:
                hh, mm = int(hour_text), int(minute_text)
                tz_name = self.timezone.zone if self.timezone else None
                detected_time = {
                    'hh': hh,
                    'mm': mm,
                    'tz': tz_name,
                    'time_type': None
                }
                detected_time['nn'] = self._get_meridiem(hh, mm, original)

                original_list.append(original)
                time_list.append(detected_time)

            # Logged once per match, whether or not the match was accepted.
            ner_logger.debug("time_list %s" % str(time_list))
            ner_logger.debug("original_list %s" % str(original_list))

        return time_list, original_list
Exemplo n.º 16
0
    def _check_doc_type_for_elasticsearch(self):
        """
        Ensure the Elasticsearch doc_type is configured in the connection settings.

        Raises:
             DataStoreSettingsImproperlyConfiguredException: if the doc_type key is missing from
             the connection settings
        """
        # TODO: This check should be during init or boot
        if constants.ELASTICSEARCH_DOC_TYPE in self._connection_settings:
            return

        ner_logger.debug("No doc type is present")
        raise DataStoreSettingsImproperlyConfiguredException(
            'Elasticsearch needs doc_type. Please configure ES_DOC_TYPE in your environment')
Exemplo n.º 17
0
def get_regex(message, entity_name, structured_value, fallback_value,
              bot_message, regex):
    """Detect text matching `regex` using RegexDetector.

    Detection runs on `structured_value` when it is truthy, otherwise on `message`.
    `bot_message` is accepted but not used by this function.

    Args:
        message (str): text to run detection on when no structured value is given
        entity_name (str): name of the entity, passed to RegexDetector
        structured_value (str): value from structured elements; takes priority over message
        fallback_value (str): value returned when nothing is detected in message
        bot_message (str): previous message from the bot (unused here)
        regex (str): regular expression to detect

    Returns:
        The output of output_entity_dict_list() when something is detected; otherwise the
        output of output_entity_dict_value() for the unverified structured value or for the
        fallback value; None when detection on message finds nothing and there is no fallback.

    Example:

        message = 'abc123'
        entity_name = 'regex'
        regex = '\\d'
        structured_value = None
        fallback_value = None
        bot_message = None
        output = get_regex(message=message, entity_name=entity_name, structured_value=structured_value,
                           fallback_value=fallback_value, bot_message=bot_message, regex=regex)
        print(output)

            >> [{'detection': 'message', 'original_text': '1', 'entity_value': {'value': '1'}}]

    """
    ner_logger.debug("BEFORE AST LITERAL REGEX>>>>>>%s" % regex)
    ner_logger.debug("REGEX>>>>>>%s" % regex)
    detector = RegexDetector(entity_name=entity_name, regex=regex)

    if structured_value:
        detected_values, detected_texts = detector.detect_entity(text=structured_value)
        if not detected_values:
            # Nothing matched: hand back the structured value itself, flagged as unverified.
            return output_entity_dict_value(structured_value, structured_value,
                                            FROM_STRUCTURE_VALUE_NOT_VERIFIED)
        return output_entity_dict_list(detected_values, detected_texts,
                                       FROM_STRUCTURE_VALUE_VERIFIED)

    detected_values, detected_texts = detector.detect_entity(text=message)
    if detected_values:
        return output_entity_dict_list(detected_values, detected_texts, FROM_MESSAGE)
    if fallback_value:
        return output_entity_dict_value(fallback_value, fallback_value, FROM_FALLBACK_VALUE)

    return None
Exemplo n.º 18
0
def combine_output(request):
    """API view that calls combine_output_of_detection_logic_and_tag() with the request data.

    Args:
        request: HTTP request; reads the 'message' GET parameter and the JSON-encoded
            'entity_data' GET parameter (defaults to an empty JSON object).

    Returns:
        HttpResponse: JSON body of the form ``{'data': output}``.
    """
    query = request.GET
    message = query.get('message')
    entity_data_json = json.loads(query.get('entity_data', '{}'))
    ner_logger.debug('Start: %s ' % message)
    output = combine_output_of_detection_logic_and_tag(entity_data=entity_data_json, text=message)
    ner_logger.debug('Finished %s : %s ' % (message, output))
    response_body = json.dumps({'data': output})
    return HttpResponse(response_body, content_type='application/json')
Exemplo n.º 19
0
    def check_if_index_exits(es_url, index_name):
        """
        Check that an index exists at the given Elasticsearch URL.

        The listing returned by ``<es_url>/_cat/indices?v`` is searched for the index name
        surrounded by spaces.

        Args:
            es_url (string): The Elasticsearch URL
            index_name (string): Name of the index to check the existence for

        Raises:
            IndexNotFoundException: if the index name is not present in the listing
        """
        index_response = requests.get('{es_url}/_cat/indices?v'.format(es_url=es_url))
        # Search the decoded body: `.content` is bytes on Python 3, and testing a str for
        # membership in bytes raises TypeError.
        if " " + index_name + " " not in index_response.text:
            message = index_name + " does not exist in " + es_url
            ner_logger.debug("check_if_index_exits - " + str(message))
            raise IndexNotFoundException(message)
Exemplo n.º 20
0
    def load_model(self, model_path=None, live_crf_model_path=None):
        """
        Load the model data for the entity and initialize the tagger for it.

        If `model_path` is given, the model is read from that local file. Otherwise the model
        is read via read_model_dict_from_s3() using `live_crf_model_path`, unless that path is
        already the loaded one and model data is present, in which case the existing tagger
        is returned.

        Args:
            model_path (str): Local path from where model has to be loaded for the given entity.
            live_crf_model_path (str): Live path for the Crf Model
        Returns:
            The existing ``self.tagger`` when the requested live model is already loaded,
            otherwise the result of ``self.initialize_tagger()``.
        """
        if model_path:
            # Context manager guarantees the file handle is closed even if read() fails.
            with open(model_path, 'r') as file_handler:
                self.entity_model_dict = file_handler.read()
            ner_logger.debug('Model dir %s path from local' % model_path)
            return self.initialize_tagger()

        ner_logger.debug('Model dir %s path from api' % live_crf_model_path)
        if live_crf_model_path == self.loaded_model_path:
            if not self.entity_model_dict:
                # Same path as before but no model data in memory yet: fetch it.
                self.entity_model_dict = read_model_dict_from_s3(bucket_name=CRF_MODEL_S3_BUCKET_NAME,
                                                                 bucket_region=CRF_MODEL_S3_BUCKET_REGION,
                                                                 model_path_location=live_crf_model_path)
                ner_logger.debug('New Model dir %s path from api' % live_crf_model_path)
            else:
                # Requested model is already loaded: reuse the current tagger.
                return self.tagger
        else:
            # A different live path was requested: fetch it and remember the path.
            self.entity_model_dict = read_model_dict_from_s3(bucket_name=CRF_MODEL_S3_BUCKET_NAME,
                                                             bucket_region=CRF_MODEL_S3_BUCKET_REGION,
                                                             model_path_location=live_crf_model_path)
            ner_logger.debug('New Model dir %s path from cache' % live_crf_model_path)
            self.loaded_model_path = live_crf_model_path
        return self.initialize_tagger()
Exemplo n.º 21
0
    def __init__(self,
                 entity_name,
                 language=ENGLISH_LANG,
                 timezone='UTC',
                 past_date_referenced=False):
        """Initializes a DateDetector object with given entity_name and pytz timezone object

        A language specific date detector module is imported for `language`; if that raises
        ImportError, the standard regex based date detector is used with the language's data
        directory instead.

        Args:
            entity_name: A string by which the detected date entity substrings would be replaced with on calling
                        detect_entity()
            language (str): language code used to pick the language specific date detector,
                            default is ENGLISH_LANG
            timezone (Optional, str): timezone identifier string that is used to create a pytz timezone object
                                      default is UTC; UTC is also used if the timezone cannot be created
            past_date_referenced (bool): to know if past or future date is referenced for date text like 'kal', 'parso'
        """
        self.text = ''
        self.tagged_text = ''
        self.processed_text = ''
        self.date = []
        self.original_date_text = []
        self.entity_name = entity_name
        self.tag = '__' + entity_name + '__'

        try:
            self.timezone = pytz.timezone(timezone)
        except Exception as error:
            ner_logger.debug('Timezone error: %s ' % error)
            self.timezone = pytz.timezone('UTC')
            ner_logger.debug('Default timezone passed as "UTC"')

        self.now_date = datetime.datetime.now(tz=self.timezone)
        self.bot_message = None
        self.language = language

        try:
            language_module_name = 'ner_v2.detectors.temporal.date.{0}.date_detection'.format(self.language)
            language_module = importlib.import_module(language_module_name)
            self.language_date_detector = language_module.DateDetector(
                entity_name=self.entity_name,
                past_date_referenced=past_date_referenced,
                timezone=self.timezone)
        except ImportError:
            # No usable language specific detector: fall back to the standard regex detector.
            fallback_module = importlib.import_module('ner_v2.detectors.temporal.date.standard_date_regex')
            self.language_date_detector = fallback_module.DateDetector(
                entity_name=self.entity_name,
                data_directory_path=get_lang_data_path(detector_path=os.path.abspath(__file__),
                                                       lang_code=self.language),
                timezone=self.timezone,
                past_date_referenced=past_date_referenced)
Exemplo n.º 22
0
    def _check_doc_type_for_elasticsearch(self):
        """
        Ensure the Elasticsearch doc_type is configured in the connection settings.

        Raises:
             DataStoreSettingsImproperlyConfiguredException: if the doc_type key is missing from
             the connection settings
        """
        if constants.ELASTICSEARCH_DOC_TYPE in self._connection_settings:
            return

        ner_logger.debug("No doc type is present in chatbot_ner.config.CHATBOT_NER_DATASTORE")
        raise DataStoreSettingsImproperlyConfiguredException(
            'Elasticsearch needs doc_type. Please configure ES_DOC_TYPE in your environment')
Exemplo n.º 23
0
def ner(request):
    """Tag a message by delegating to run_ner() and return the result as JSON.

    Reads the 'message' and 'entities' query parameters from request.GET. A non-empty
    'entities' value is parsed with ast.literal_eval; otherwise an empty list is used.

    Attributes:
        request: url parameters

    Returns:
        HttpResponse with an 'application/json' body of the form {'data': <run_ner output>}
    """
    query_params = request.GET
    message = query_params.get('message')
    raw_entities = query_params.get('entities', [])
    # literal_eval only accepts Python literals, so arbitrary expressions are rejected.
    entities = ast.literal_eval(raw_entities) if raw_entities else []

    ner_logger.debug('Start: %s -- %s' % (message, entities))
    output = run_ner(entities=entities, message=message)
    ner_logger.debug('Finished %s : %s ' % (message, output))

    response_body = json.dumps({'data': output})
    return HttpResponse(response_body, content_type='application/json')
Exemplo n.º 24
0
    def generate_crf_model_path(self):
        """
        Build the timestamped path under which this entity's CRF model is stored.

        The parent directory of the model path prefix is created when it does not exist yet.

        Returns:
            output_directory (str): CRF_MODELS_PATH + entity_name + '/' + entity_name followed by
                the current time formatted as "%d%m%Y-%H%M%S".
        """
        entity_root = CRF_MODELS_PATH + self.entity_name
        model_path_prefix = entity_root + '/' + self.entity_name

        # Create the directory that will hold the model file, if it is missing.
        parent_directory = os.path.dirname(model_path_prefix)
        if not os.path.exists(parent_directory):
            os.makedirs(parent_directory)
            ner_logger.debug('creating new directory %s' % entity_root)

        timestamp = datetime.now().strftime("%d%m%Y-%H%M%S")
        return model_path_prefix + timestamp
Exemplo n.º 25
0
 def load_word_vectors_local():
     """
     Load the pickled word list and word vectors from their configured local paths.

     Both files are opened inside ``with`` blocks so the handles are closed even when
     unpickling fails. Any error is logged at debug level and the values loaded so far
     (or the empty defaults) are returned instead of raising.

     Returns:
     vocab (list): word list unpickled from CRF_EMBEDDINGS_PATH_VOCAB, or [] on failure.
     word_vectors (numpy.array): array built from the object unpickled from
         CRF_EMBEDDINGS_PATH_VECTORS, or an empty array on failure.
     """
     vocab = []
     word_vectors = np.array([])
     try:
         # NOTE: pickle must only be used on trusted files; these paths come from project config.
         with open(CRF_EMBEDDINGS_PATH_VOCAB, 'rb') as vocab_file:
             vocab = pickle.load(vocab_file)
         with open(CRF_EMBEDDINGS_PATH_VECTORS, 'rb') as vectors_file:
             word_vectors = np.array(pickle.load(vectors_file))
     except Exception as e:
         # Best-effort loader: callers receive the defaults when the files are unavailable.
         ner_logger.debug('Error in loading local word vectors %s' % e)
     return vocab, word_vectors
Exemplo n.º 26
0
    def get_model_output(self, entity_type, bot_message, user_message):
        """
        Run the CRF pipeline and return the list of tagged data for the given entity type.

        When MODEL_RUN is truthy this calls initialize_files(), add_data_to_tagger() and
        run_crf() in that order; the CRF output is converted with generate_city_output()
        only when entity_type equals CITY_ENTITY_TYPE. When MODEL_RUN is falsy a debug
        message is logged and an empty list is returned.
        """
        if not MODEL_RUN:
            ner_logger.debug('MODEL IS NOT RUNNING: CRFPP not installed')
            return []

        self.initialize_files(CITY_ENTITY_TYPE)
        self.add_data_to_tagger(bot_message, user_message)
        crf_output = self.run_crf()

        if entity_type != CITY_ENTITY_TYPE:
            # No output generator is wired up here for other entity types.
            return []
        return self.generate_city_output(crf_data=crf_output)
Exemplo n.º 27
0
def shopping_size(request):
    """Detect shopping size by delegating to get_shopping_size(). It is called through api call

    The request is parsed with parse_parameters_from_request() and the message, entity name,
    structured value, fallback value and bot message are passed on positionally.

    Attributes:
        request: url parameters

    Returns:
        HttpResponse with JSON body {'data': <entity output>}, or an empty HttpResponse with
        status 500 when a TypeError is raised while parsing or detecting.
    """
    try:
        params = parse_parameters_from_request(request)
        message = params[PARAMETER_MESSAGE]
        entity_name = params[PARAMETER_ENTITY_NAME]
        structured_value = params[PARAMETER_STRUCTURED_VALUE]
        fallback_value = params[PARAMETER_FALLBACK_VALUE]
        bot_message = params[PARAMETER_BOT_MESSAGE]
        entity_output = get_shopping_size(message, entity_name, structured_value, fallback_value, bot_message)
        ner_logger.debug('Finished %s : %s ' % (params[PARAMETER_ENTITY_NAME], entity_output))
    except TypeError as e:
        ner_logger.exception('Exception for shopping_size: %s ' % e)
        return HttpResponse(status=500)

    payload = json.dumps({'data': entity_output})
    return HttpResponse(payload, content_type='application/json')
Exemplo n.º 28
0
def delete_entity_data_by_values(connection, index_name, doc_type, entity_name, values=None, **kwargs):
    """
    Delete the ES documents of an entity that get_entity_data() returns for the given values.

    Matching records are first collected into batches of at most
    constants.ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE delete actions; only after all records
    have been read is each batch sent through helpers.bulk().

    Args:
        connection (elasticsearch.client.Elasticsearch): Elasticsearch client object
        index_name (str): The name of the index
        doc_type (str): The type of the documents that will be deleted
        entity_name (str): name of the entity for which the data is to be deleted.
        values (optional): values forwarded to get_entity_data() to select the records.
            If None, all records are deleted
        **kwargs: forwarded both to get_entity_data() and to helpers.bulk()
    Returns:
        None
    """
    matching_records = get_entity_data(
        connection=connection,
        index_name=index_name,
        doc_type=doc_type,
        entity_name=entity_name,
        values=values,
        **kwargs
    )

    batch_limit = constants.ELASTICSEARCH_BULK_HELPER_MESSAGE_SIZE
    batches = []
    pending_actions = []
    for record in matching_records:
        pending_actions.append({
            '_index': index_name,
            '_type': doc_type,
            '_id': record["_id"],
            '_op_type': 'delete',
        })
        # Close the batch as soon as it reaches the configured bulk size.
        if len(pending_actions) == batch_limit:
            batches.append(pending_actions)
            pending_actions = []

    # Keep the trailing, partially filled batch.
    if pending_actions:
        batches.append(pending_actions)

    for batch in batches:
        result = helpers.bulk(connection, batch, stats_only=True, **kwargs)
        ner_logger.debug('delete_entity_data_by_values: entity_name: {0} result {1}'.format(entity_name, str(result)))
Exemplo n.º 29
0
def phone_number(request):
    """Detect phone numbers by delegating to get_phone_number(). It is called through api call

    The request is converted with get_parameters_dictionary() and the message, entity name,
    structured value, structured value verification, fallback value and expert message are
    passed on positionally. Any exception is logged at debug level and the entity output
    falls back to an empty dict.

    NOTE(review): the visible code builds entity_output but does not return a response;
    confirm whether the remainder of the view was lost and restore it if so.

    Attributes:
        request: url parameters

    """
    entity_name = None
    try:
        parameters_dict = get_parameters_dictionary(request)
        # Resolve the name before logging so the "Start" line does not always print None.
        entity_name = parameters_dict[PARAMETER_ENTITY_NAME]
        ner_logger.debug('Start: %s ' % entity_name)
        entity_output = get_phone_number(parameters_dict[PARAMETER_MESSAGE], entity_name,
                                         parameters_dict[PARAMETER_STRUCTURED_VALUE],
                                         parameters_dict[PARAMETER_STRUCTURED_VALUE_VERIFICATION],
                                         parameters_dict[PARAMETER_FALLBACK_VALUE],
                                         parameters_dict[PARAMETER_EXPERT_MESSAGE])
    except Exception as e:
        # "except Exception, e" is Python 2-only; the "as" form works on 2.6+ and 3.
        entity_output = {}
        ner_logger.debug('Exception for phone_number: %s ' % e)
Exemplo n.º 30
0
    def __init__(self,
                 entity_name,
                 data_directory_path,
                 timezone='UTC',
                 range_enabled=False,
                 form_check=False):
        """
        Base Regex class which will be imported by language-specific detector classes by giving
        their data folder path. This will create standard regex and their parser for the given
        language.

        NOTE(review): the original wording said "date", but the attributes and detector methods
        set up here are named for time detection - confirm which is intended.

        Args:
            entity_name (str): name of the entity; also used to build the tag '__<entity_name>__'
            data_directory_path (str): path of data folder for given language
            timezone (str): user timezone default UTC; falls back to UTC when pytz cannot
                resolve the given name
            range_enabled (bool): not referenced in the visible part of this initializer
            form_check (bool): not referenced in the visible part of this initializer
        """
        self.text = ''
        self.tagged_text = ''
        self.processed_text = ''
        self.entity_name = entity_name
        self.tag = '__' + entity_name + '__'
        try:
            self.timezone = pytz.timezone(timezone)
        except Exception as e:
            # Any failure to resolve the timezone is logged and replaced by UTC.
            ner_logger.debug('Timezone error: %s ' % e)
            self.timezone = pytz.timezone('UTC')
            ner_logger.debug('Default timezone passed as "UTC"')
        # Timezone-aware "now", captured once at construction time.
        self.now_date = datetime.datetime.now(tz=self.timezone)
        self.bot_message = None

        # dict to store words for time, numerals and words which comes in reference to some date
        self.time_constant_dict = {}
        self.datetime_constant_dict = {}
        self.numerals_constant_dict = {}

        # define dynamic created standard regex for time from language data files
        self.regex_time = None

        # Method to initialise value in regex
        # (called after the attributes above are created; presumably it fills them from
        # data_directory_path - verify against init_regex_and_parser)
        self.init_regex_and_parser(data_directory_path)

        # Variable to define default order in which these regex will work
        self.detector_preferences = [
            self._detect_time_with_coln_format, self._detect_hour_minute
        ]