def get_records_from_values(entity_name, values=None):
    """
    Fetch entity data for the specified values in that entity

    Args:
        entity_name (str): Name of the entity for which records are to be fetched
        values (list, optional): List of str values for which the data is to be fetched

    Returns:
        dict: Dictionary mapping each entity value to a per-language dictionary

    Sample:
        {
            'entity_value': {
                'en': {
                    '_id': 'Random ES ID',
                    'value': ['Variant 1', 'Variant 2']
                },
                'hi': {
                    '_id': 'Random ES ID',
                    'value': ['Variant 1', 'Variant 2']
                }
            }
        }
    """
    datastore_obj = DataStore()
    results = datastore_obj.get_entity_data(entity_name=entity_name, values=values)

    merged_records = {}
    for result in results:
        merged_records.setdefault(result['_source']['value'], {})
        merged_records[result['_source']['value']][result['_source']['language_script']] = {
            '_id': result['_id'],
            'value': result['_source']['variants'],
        }
    return merged_records

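# Illustrative usage sketch, not part of the original module: the entity name and values
# below are assumptions, and the loop simply walks the structure documented in the Sample
# above (value -> language_script -> {'_id', 'value'}).
def _example_get_city_records():
    records = get_records_from_values(entity_name='city', values=['Mumbai', 'Delhi'])
    for value, language_data in records.items():
        for language_script, record in language_data.items():
            # record['_id'] is the ES document id, record['value'] the list of variants
            print(value, language_script, record['value'])
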
def update_dictionary(request):
    """
    This function is used to update the dictionary entities.

    Args:
        request (HttpRequest): HTTP request received at the url

    Returns:
        HttpResponse: HttpResponse with appropriate status and error message.
    """
    response = {"success": False, "error": "", "result": []}
    try:
        external_api_data = json.loads(request.POST.get(EXTERNAL_API_DATA))
        entity_name = external_api_data.get(ENTITY_NAME)
        entity_data = external_api_data.get(ENTITY_DATA)
        language_script = external_api_data.get(LANGUAGE_SCRIPT)
        datastore_obj = DataStore()
        datastore_obj.update_entity_data(entity_name=entity_name,
                                         entity_data=entity_data,
                                         language_script=language_script)
        response['success'] = True
    except (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
            EngineConnectionException, FetchIndexForAliasException) as error_message:
        response['error'] = str(error_message)
        ner_logger.exception('Error: %s' % error_message)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)
    except BaseException as e:
        response['error'] = str(e)
        ner_logger.exception('Error: %s' % e)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)
    return HttpResponse(json.dumps(response), content_type='application/json', status=200)

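# Illustrative sketch, not part of the original module: builds the POST body that
# update_dictionary() reads. The top-level "external_api_data" key follows the convention
# shown in update_crf_training_data()'s docstring; the entity name, record shape and
# language code below are assumptions for illustration only.
def _example_update_dictionary_payload():
    return {
        'external_api_data': json.dumps({
            'entity_name': 'city',
            'entity_data': [{'value': 'Mumbai', 'variants': ['mumbai', 'bombay']}],  # assumed record shape
            'language_script': 'en',
        })
    }
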
def transfer_entities(request):
    """
    This method is used to transfer entities from the source to the destination.

    Args:
        request (HttpRequest): HTTP request received at the url

    Returns:
        HttpResponse: HttpResponse with appropriate status and error message.
    """
    response = {"success": False, "error": "", "result": []}
    try:
        external_api_data = json.loads(request.POST.get(EXTERNAL_API_DATA))
        entity_list = external_api_data.get(ENTITY_LIST)
        datastore_object = DataStore()
        datastore_object.transfer_entities_elastic_search(entity_list=entity_list)
        response['success'] = True
    except (IndexNotFoundException, InvalidESURLException, SourceDestinationSimilarException,
            InternalBackupException, AliasNotFoundException, PointIndexToAliasException,
            FetchIndexForAliasException, DeleteIndexFromAliasException, AliasForTransferException,
            IndexForTransferException, NonESEngineTransferException) as error_message:
        response['error'] = str(error_message)
        ner_logger.exception('Error: %s' % error_message)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)
    except BaseException as e:
        response['error'] = str(e)
        ner_logger.exception('Error: %s' % e)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)
    return HttpResponse(json.dumps(response), content_type='application/json', status=200)

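# Illustrative sketch, not part of the original module: transfer_entities() only needs a
# list of entity names under the "external_api_data" key. The entity names below are
# placeholders.
def _example_transfer_entities_payload():
    return {
        'external_api_data': json.dumps({
            'entity_list': ['city', 'restaurant'],
        })
    }
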
def train_model_from_es_data(self):
    """
    Train the crf model by first extracting the training data for the entity from ES
    and then training the model on it.

    Returns:
        str: Path at which the trained model is stored.
    """
    datastore_object = DataStore()
    ner_logger.debug('Fetch of data from ES for ENTITY: %s started' % self.entity_name)
    result = datastore_object.get_crf_data_for_entity_name(entity_name=self.entity_name)
    sentence_list = result.get(SENTENCE_LIST, [])
    entity_list = result.get(ENTITY_LIST, [])
    if not sentence_list:
        raise ESCrfTrainingTextListNotFoundException()
    if not entity_list:
        raise ESCrfTrainingEntityListNotFoundException()
    ner_logger.debug('Fetch of data from ES for ENTITY: %s completed' % self.entity_name)
    ner_logger.debug('Length of text_list %s' % str(len(sentence_list)))
    model_path = self.train_crf_model_from_list(entity_list=entity_list,
                                                sentence_list=sentence_list)
    return model_path

def get_crf_training_data(request):
    """
    This function is used to obtain the training data for the given entity_name.

    Args:
        request (HttpRequest): HTTP request received at the url

    Returns:
        HttpResponse: With data consisting of a dictionary containing sentence_list and entity_list

    Examples:
        GET request params
        key: "entity_name"
        value: "city"
    """
    response = {"success": False, "error": "", "result": []}
    try:
        entity_name = request.GET.get(ENTITY_NAME)
        datastore_obj = DataStore()
        result = datastore_obj.get_crf_data_for_entity_name(entity_name=entity_name)
        response['result'] = result
        response['success'] = True
    except (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
            EngineConnectionException, FetchIndexForAliasException) as error_message:
        response['error'] = str(error_message)
        ner_logger.exception('Error: %s' % error_message)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)
    except BaseException as e:
        response['error'] = str(e)
        ner_logger.exception('Error: %s' % e)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)
    return HttpResponse(json.dumps(response), content_type='application/json', status=200)

def entity_supported_languages(entity_name):
    """
    Fetch the list of supported languages for the specified entity

    Args:
        entity_name (str): Name of the entity for which supported languages are to be fetched

    Returns:
        list: List of language codes
    """
    datastore_obj = DataStore()
    return datastore_obj.get_entity_supported_languages(entity_name=entity_name)

def delete_records_by_values(entity_name, values):
    """
    Delete entity data for the specified values in that entity

    Args:
        entity_name (str): Name of the entity for which records are to be deleted
        values (list): List of str values for which the data is to be deleted

    Returns:
        None
    """
    datastore_obj = DataStore()
    datastore_obj.delete_entity_data_by_values(entity_name=entity_name, values=values)

def entity_update_languages(entity_name, new_language_list):
    """
    Update the language support list of the entity by creating dummy records.
    Currently does not support removal of a language. It creates empty variant records
    for all the unique values present in this entity.

    Args:
        entity_name (str): Name of the entity whose language support is to be updated
        new_language_list (list): List of language codes the entity should support

    Returns:
        bool: Success flag indicating that the update went through

    Raises:
        APIHandlerException (Exception): for any validation errors
    """
    old_language_list = entity_supported_languages(entity_name)
    languages_added = set(new_language_list) - set(old_language_list)
    languages_removed = set(old_language_list) - set(new_language_list)

    if languages_removed:
        # raise exception as it is not currently supported
        raise APIHandlerException('Removing languages is not currently supported.')

    if not languages_added:
        # no change in language list. raise error
        raise APIHandlerException('No new languages provided. Nothing changed.')

    # fetch all words
    # TODO: If possible add records in single ES query instead of
    #  two (get_entity_unique_values + db.add_entity_data)
    values = get_entity_unique_values(entity_name=entity_name)
    if not values:
        raise APIHandlerException('This entity does not have any records. Please verify the entity name')

    records_to_create = []
    for language_script in languages_added:
        # create records for all words
        for value in values:
            if value and language_script:
                records_to_create.append({
                    'value': value,
                    'language_script': language_script,
                    'variants': []
                })

    datastore_obj = DataStore()
    datastore_obj.add_entity_data(entity_name, records_to_create)

    return True

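# Illustrative usage sketch, not part of the original module: adds Hindi support to an
# assumed 'city' entity. The new list must be a superset of the currently supported
# languages, otherwise entity_update_languages() raises APIHandlerException.
def _example_add_language_support():
    current_languages = entity_supported_languages('city')  # e.g. ['en']
    return entity_update_languages('city', current_languages + ['hi'])
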
def get_entity_word_variants(request):
    """
    This function is used to obtain the entity dictionary for the given entity name.

    Args:
        request (HttpRequest): HTTP request received at the url

    Returns:
        HttpResponse: With data consisting of a list of value variants.
    """
    response = {"success": False, "error": "", "result": []}
    try:
        entity_name = request.GET.get(ENTITY_NAME)
        datastore_obj = DataStore()
        result = datastore_obj.get_entity_dictionary(entity_name=entity_name)
        structured_result = []
        # The list around result.keys() is to make it compatible with python3
        key_list = list(result.keys())
        key_list.sort()
        for value in key_list:
            structured_result.append({'value': value, 'variants': result[value]})
        result = structured_result
        response['result'] = result
        response['success'] = True
    except (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
            EngineConnectionException, FetchIndexForAliasException) as error_message:
        response['error'] = str(error_message)
        ner_logger.exception('Error: %s' % error_message)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)
    except BaseException as e:
        response['error'] = str(e)
        ner_logger.exception('Error: %s' % e)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)
    return HttpResponse(json.dumps(response), content_type='application/json', status=200)

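# Illustrative sketch, not part of the original module: approximate shape of the JSON body
# returned by get_entity_word_variants() for an assumed 'city' entity. 'result' is a list of
# {'value', 'variants'} dicts sorted by value; the concrete values and the flat variant list
# are placeholders/assumptions.
_EXAMPLE_WORD_VARIANTS_RESPONSE = {
    'success': True,
    'error': '',
    'result': [
        {'value': 'Delhi', 'variants': ['delhi', 'new delhi']},
        {'value': 'Mumbai', 'variants': ['mumbai', 'bombay']},
    ],
}
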
def update_crf_training_data(request):
    """
    This function is used to update the training data.

    Args:
        request (HttpRequest): HTTP request received at the url

    Returns:
        HttpResponse: HttpResponse with appropriate status and error message.

    Example of data present in the POST request body
        key: "external_api_data"
        value: {"sentence_list": ["hello pratik", "hello hardik"],
                "entity_list": [["pratik"], ["hardik"]],
                "entity_name": "training_try3",
                "language_script": "en"}
    """
    response = {"success": False, "error": "", "result": []}
    try:
        external_api_data = json.loads(request.POST.get(EXTERNAL_API_DATA))
        sentences = external_api_data.get(SENTENCES)
        entity_name = external_api_data.get(ENTITY_NAME)
        DataStore().update_entity_crf_data(entity_name=entity_name, sentences=sentences)
        response['success'] = True
    except (DataStoreSettingsImproperlyConfiguredException, EngineNotImplementedException,
            EngineConnectionException, FetchIndexForAliasException) as error_message:
        response['error'] = str(error_message)
        ner_logger.exception('Error: %s' % error_message)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)
    except BaseException as e:
        response['error'] = str(e)
        ner_logger.exception('Error: %s' % e)
        return HttpResponse(json.dumps(response), content_type='application/json', status=500)
    return HttpResponse(json.dumps(response), content_type='application/json', status=200)

def update_entity_records(entity_name, data):
    """
    Update dictionary data with the edited and deleted records

    Args:
        entity_name (str): Name of the entity for which records are to be updated
        data (dict): Dictionary of edited and deleted data. If the replace flag is true,
            then all existing data is deleted before adding the records

    Returns:
        None
    """
    # Delete some records first
    records_to_delete = data.get('deleted', [])
    records_to_create = data.get('edited', [])
    replace_data = data.get('replace')

    if replace_data:
        # TODO: Delete everything for the `entity_name` without having to fetch values first!
        # https://www.elastic.co/guide/en/elasticsearch/reference/5.6/docs-delete-by-query.html
        values_to_delete = get_entity_unique_values(entity_name)
    else:
        values_to_delete = [record['word'] for record in records_to_delete]
        values_to_delete.extend([record['word'] for record in records_to_create])

    value_variants_to_create = []
    for record in records_to_create:
        for language_script, variants in record.get('variants', {}).items():
            if record['word'] and language_script:
                value_variants_to_create.append({
                    'value': record['word'],
                    'language_script': language_script,
                    'variants': variants.get('value', [])
                })

    # delete words
    delete_records_by_values(entity_name=entity_name, values=values_to_delete)
    datastore_obj = DataStore()
    datastore_obj.add_entity_data(entity_name, value_variants_to_create)

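# Illustrative usage sketch, not part of the original module: the 'data' dict mirrors the
# structure read by update_entity_records() above ('word' per record, per-language variants
# under 'variants', each with a 'value' list). The entity name and words are placeholders.
def _example_update_city_records():
    data = {
        'replace': False,
        'deleted': [{'word': 'Bombay'}],
        'edited': [
            {
                'word': 'Mumbai',
                'variants': {
                    'en': {'value': ['mumbai', 'bombay']},
                    'hi': {'value': []},
                },
            },
        ],
    }
    update_entity_records(entity_name='city', data=data)
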
def get_entity_unique_values(entity_name, empty_variants_only=False,
                             value_search_term=None, variant_search_term=None):
    """
    Get a list of unique values belonging to this entity

    Args:
        entity_name (str): Name of the entity for which unique values are to be fetched
        empty_variants_only (bool, optional): Flag to search for values with empty variants only
        value_search_term (str, optional): Search term to filter values from this entity data
        variant_search_term (str, optional): Search term to filter variants from the entity data

    Returns:
        list: List of strings which are unique values in the entity
    """
    datastore_obj = DataStore()
    return datastore_obj.get_entity_unique_values(
        entity_name=entity_name,
        value_search_term=value_search_term,
        variant_search_term=variant_search_term,
        empty_variants_only=empty_variants_only)

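# Illustrative usage sketch, not part of the original module: fetches only those values of
# an assumed 'city' entity that have no variants yet, e.g. the dummy records created by
# entity_update_languages().
def _example_values_missing_variants():
    return get_entity_unique_values(entity_name='city', empty_variants_only=True)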