def get_usage_from_es(index_name, query):
    """Run *query* against *index_name* and return the total hit count.

    Returns 0 when the search yields no response.
    """
    es_client = elasticsearch_connector.get_instance()
    response = es_client.generic_search_query(index_name, query)
    if not response:
        return 0
    return response["hits"]["total"]["value"]
def get_feedback_count_per_user_for_all_documents(user_id):
    """Return ``{doc_id: [liked_status, disliked_status]}`` for every document
    the given user has rated, or ``False`` when there is no data or the query
    fails.

    Feedback is stored as 1 (like) / -1 (dislike); a document is flagged
    liked (or disliked) only when that side has the strictly greater count.
    """
    config = utils.config_parser()
    user_feedback_index = config.get('elasticsearch', 'user_feedback_index_name')
    elastic_obj = elasticsearch_connector.get_instance()
    aggregation_query = {
        "query": {"match": {"userId": {"query": user_id}}},
        "aggs": {
            "user_feedback_aggregation": {
                "terms": {"field": "doc_id.keyword", "size": 10000},
                "aggs": {"group_by_feedback": {"terms": {"field": "feedback"}}}
            }
        }
    }
    user_like_dislike = {}
    try:
        result = elastic_obj.generic_search_query(user_feedback_index, aggregation_query)
        if not result:
            return False
        buckets = result["aggregations"]["user_feedback_aggregation"]["buckets"]
        if not buckets:
            return False
        for bucket in buckets:
            liked_count, disliked_count = 0, 0
            for feedback in bucket['group_by_feedback']['buckets']:
                if feedback['key'] == 1:
                    liked_count = feedback['doc_count']
                elif feedback['key'] == -1:
                    disliked_count = feedback['doc_count']
            # Majority vote: only the strictly larger side sets its flag;
            # a tie leaves both flags False.
            user_like_dislike[bucket['key']] = [
                liked_count > disliked_count,
                disliked_count > liked_count,
            ]
        return user_like_dislike
    except Exception:  # was a bare except: narrow it and log, keep best-effort contract
        logger.exception("Failed to aggregate feedback for user %s", user_id)
        return False
def get_event_id_from_index(self):
    """Return the last processed ``cursor_event_id`` stored in the app-data index.

    Seeds the cursor document with 0 and returns ``False`` when no cursor
    document exists yet. Bug fix: the original fell off the end (implicitly
    returning None) when the index itself did not exist; now it consistently
    returns ``False`` in that case too.
    """
    config = utils.config_parser()
    latest_event_index = config.get('elasticsearch', 'app_data_index')
    es_obj = elasticsearch_connector.get_instance()
    search_query = {"query": {"match": {"_id": "cursor_id"}}}
    if not es_obj.check_if_index_exists(index_name=latest_event_index):
        return False
    json_data = es_obj.generic_search_query(latest_event_index, search_query)
    hits = json_data['hits']['hits']
    if not hits:
        es_obj.insert_document(latest_event_index, 'pptx', 'cursor_id',
                               {'cursor_event_id': 0})
        logger.info("App Data Index Created Successfully")
        return False
    for hit in hits:
        hit_source = hit.get('_source')
        if 'cursor_event_id' in hit_source:
            return hit_source.get('cursor_event_id')
        # Cursor document exists but is malformed: reseed it.
        es_obj.insert_document(latest_event_index, 'pptx', 'cursor_id',
                               {'cursor_event_id': 0})
        logger.info("App Data Index Created Successfully")
        return False
def get_download_count_for_document(doc_id):
    """Return the number of logged downloads for *doc_id* (0 when none or on error)."""
    config = utils.config_parser()
    index_name = config.get('elasticsearch', 'download_logs_index_name')
    es_obj = elasticsearch_connector.get_instance()
    search_query = {
        "aggs": {
            "user_download_aggregation": {
                "filter": {"term": {"doc_id": doc_id}},
                "aggs": {
                    "num_of_downloads": {"terms": {"field": "doc_id.keyword"}}
                }
            }
        }
    }
    try:
        json_data = es_obj.generic_search_query(index_name, search_query)
        buckets = json_data['aggregations']["user_download_aggregation"][
            "num_of_downloads"]["buckets"]
        # The filter restricts the aggregation to a single doc_id, so the
        # first bucket (when present) carries the full count.
        return buckets[0]["doc_count"] if buckets else 0
    except Exception:  # was a bare except; log and keep the best-effort 0
        logger.exception("Failed to fetch download count for %s", doc_id)
        return 0
def create_document_preview_response(doc_id):
    """Fetch the preview fields for *doc_id* from the corpus index.

    Returns the document's ``_source`` dict, or ``{}`` when the id is unknown.
    """
    config = utils.config_parser()
    corpus_index_name = config.get('elasticsearch', 'corpus_index_name')
    elastic_obj = elasticsearch_connector.get_instance()
    preview_fields = [
        "file_name", "title", "url", "doc_type", "created_by", "modified_by",
        "num_downloads", "ratings", "created_time", "modified_time",
        "slides.page_number", "slides.thumbnail_large"
    ]
    search_query = {
        "query": {"match": {"_id": {"query": doc_id}}},
        "_source": {"includes": preview_fields},
    }
    result = elastic_obj.generic_search_query(corpus_index_name, search_query)
    hits = result['hits']['hits']
    return hits[0]['_source'] if hits else {}
def get_feedback_count_for_document(doc_id):
    """Return ``(net_rating, num_likes, num_dislikes)`` for *doc_id*.

    The net rating is ``sum(key * doc_count)`` over the feedback buckets,
    i.e. likes (+1) minus dislikes (-1). Returns ``(0, 0, 0)`` when there is
    no feedback or the query fails.
    """
    config = utils.config_parser()
    user_feedback_index = config.get('elasticsearch', 'user_feedback_index_name')
    elastic_obj = elasticsearch_connector.get_instance()
    aggregation_query = {
        "query": {"match_phrase": {"doc_id": {"query": doc_id}}},
        "aggs": {
            "user_feedback_aggregation": {
                "terms": {"field": "doc_id.keyword", "size": 10000},
                "aggs": {"group_by_feedback": {"terms": {"field": "feedback"}}}
            }
        }
    }
    try:
        result = elastic_obj.generic_search_query(user_feedback_index, aggregation_query)
        if not result:
            return 0, 0, 0
        buckets = result["aggregations"]["user_feedback_aggregation"]["buckets"]
        if not buckets:
            return 0, 0, 0
        feedback_count, num_likes, num_dislikes = 0, 0, 0
        # match_phrase restricts to one doc_id, so only the first bucket matters.
        for feedback in buckets[0]['group_by_feedback']['buckets']:
            feedback_count += feedback['key'] * feedback['doc_count']
            if feedback['key'] == 1:
                num_likes = feedback['doc_count']
            elif feedback['key'] == -1:
                num_dislikes = feedback['doc_count']
        return feedback_count, num_likes, num_dislikes
    except Exception:  # was a bare except; log and keep the best-effort zeros
        logger.exception("Failed to fetch feedback counts for %s", doc_id)
        return 0, 0, 0
def log_file_download_event(log_data):
    """Index *log_data* into the download-logs index and return the insert result."""
    index_name = utils.config_parser().get('elasticsearch', 'download_logs_index_name')
    es_client = elasticsearch_connector.get_instance()
    return es_client.insert_document(index_name, "pptx", None, log_data)
def log_autosuggest_feedback_event(log_data):
    """Index *log_data* into the autosuggest-feedback index and return the insert result."""
    index_name = utils.config_parser().get('elasticsearch',
                                           'autosuggest_feedback_index_name')
    es_client = elasticsearch_connector.get_instance()
    return es_client.insert_document(index_name, "pptx", None, log_data)
def log_subjective_feedback(payload_data):
    """Index *payload_data* into the subjective-feedback index and return the insert result."""
    index_name = utils.config_parser().get('elasticsearch', 'subjective_feedback_index')
    es_client = elasticsearch_connector.get_instance()
    return es_client.insert_document(index_name, "pptx", None, payload_data)
def update_ratings_for_all_docs():
    """Recompute and write ratings/likes/dislikes for every rated document.

    Aggregates the user-feedback index per doc_id, then writes the totals onto
    each corpus document via a painless update script.
    """
    config = utils.config_parser()
    user_feedback_index = config.get('elasticsearch', 'user_feedback_index_name')
    corpus_index_name = config.get('elasticsearch', 'corpus_index_name')
    doc_type = config.get('elasticsearch', 'doc_type')
    elastic_obj = elasticsearch_connector.get_instance()
    aggregation_query = {
        "aggs": {
            "user_feedback_aggregation": {
                "terms": {"field": "doc_id.keyword", "size": 10000},
                "aggs": {"group_by_feedback": {"terms": {"field": "feedback"}}}
            }
        }
    }
    result = elastic_obj.generic_search_query(user_feedback_index, aggregation_query)
    doc_buckets = result["aggregations"]["user_feedback_aggregation"]["buckets"]
    for doc_bucket in doc_buckets:
        feedback_count, num_likes, num_dislikes = 0, 0, 0
        for feedback in doc_bucket['group_by_feedback']['buckets']:
            feedback_count += feedback['key'] * feedback['doc_count']
            if feedback['key'] == 1:
                num_likes = feedback['doc_count']
            elif feedback['key'] == -1:
                num_dislikes = feedback['doc_count']
        ratings = {
            "script": {
                "source": "ctx._source.ratings = params.ratings; ctx._source.num_likes = params.num_likes; ctx._source.num_dislikes = params.num_dislikes",
                "lang": "painless",
                "params": {
                    "ratings": feedback_count,
                    "num_likes": num_likes,
                    "num_dislikes": num_dislikes
                }
            }
        }
        updated = elastic_obj.update_document(corpus_index_name, doc_type,
                                              doc_bucket['key'], ratings)
        if updated:
            logger.info("Aggregated ratings updated on corpus index")
        else:
            logger.error("Could not aggregate ratings on corpus index")
def log_user_feedback(is_authenticated, is_authorized):
    """Flask view: record a like/dislike event for a document.

    Validates the JSON payload, logs the feedback event, then returns the
    refreshed like/dislike counts. Responds 401 for unauthorized users and
    400 on payload validation failure.
    """
    if not is_authorized:
        return render_template("unauthorized_user.html"), 401
    data_json = request.get_json()
    payload_data = {
        'docID': data_json.get('docId'),
        'searchQuery': data_json.get('searchQuery'),
        'feedback_timestamp': data_json.get('DateTime'),
        'feedback': data_json.get('feedback'),
    }
    for field in payload_data:
        response_msg = check_field_validations(payload_data, field)
        if response_msg:
            return Response(json.dumps(response_msg), status=400,
                            mimetype='application/json')
    es_connect = elasticsearch_connector.get_instance()
    log_data = {
        'userId': session['unique_id'].lower(),
        'doc_id': payload_data['docID'],
        'search_query': payload_data['searchQuery'],
        'feedback_timestamp': payload_data['feedback_timestamp'],
        'feedback': int(payload_data['feedback']),
    }
    response_data = {
        'liked_status': log_data['feedback'] == 1,
        'disliked_status': log_data['feedback'] == -1,
        'doc_id': log_data['doc_id'],
    }
    logged_result = user_feedback_logger.log_feedback_event(log_data)
    if not logged_result:
        return Response(json.dumps(
            {'failure': 'Could not log feedback from the user'}),
                        status=200, mimetype='application/json')
    response_data['num_likes'], response_data['num_dislikes'] = \
        user_feedback_logger.update_feedback_count_for_document(
            data_json.get('docId'), int(data_json.get('feedback')))
    return Response(json.dumps(response_data), status=200,
                    mimetype='application/json')
def update_download_count_for_document(doc_id, update_query):
    """Apply *update_query* to *doc_id* in the corpus index.

    Returns True on success, False (after logging) on any failure.
    """
    config = utils.config_parser()
    corpus_index_name = config.get('elasticsearch', 'corpus_index_name')
    doc_type = config.get('elasticsearch', 'doc_type')
    es_obj = elasticsearch_connector.get_instance()
    try:
        es_obj.update_document(corpus_index_name, doc_type, doc_id, update_query)
        return True
    except Exception:  # was a bare except; also switch to lazy log formatting
        logger.exception("Updating download count is failed for %s ", doc_id)
        return False
def parse_documents(file_relative_path):
    """Parse and index downloaded corpus files, then clear the download directory.

    Returns ``(number_of_indexed_files, no_of_files_having_indexing_error,
    files_having_indexing_error)``.
    """
    config = utils.config_parser()
    destination_local_path = config.get('generic', 'corpus_download_path')
    corpus_index = config.get('elasticsearch', 'corpus_index_name')
    try:
        es_connect = elasticsearch_connector.get_instance()
        # es_connect.clear_index()
    except Exception:
        # Bug fix: the original swallowed the failure and then used the
        # unbound es_connect, crashing with a NameError; log and re-raise.
        logger.exception("Cannot connect to elastic search")
        raise
    number_of_indexed_files, no_of_files_having_indexing_error, files_having_indexing_error = \
        corpus_indexer.parse_and_index_documents(destination_local_path, es_connect,
                                                 corpus_index, file_relative_path)
    corpus_indexer.clear_corpus_download_directory(destination_local_path)
    return number_of_indexed_files, no_of_files_having_indexing_error, files_having_indexing_error
def get_recently_added_documents(is_authenticated, is_authorized):
    """Flask view: return the 10 most recently indexed corpus documents.

    Documents are sorted by ``indexing_time`` descending. Responds 401 for
    unauthorized users and 400 when nothing could be fetched.
    """
    if not is_authorized:
        return render_template("unauthorized_user.html"), 401
    config = utils.config_parser()
    corpus_index = config.get('elasticsearch', 'corpus_index_name')
    es_obj = elasticsearch_connector.get_instance()
    recently_added_documents = {
        "_source": ["source_path", "file_name", "title", "indexing_time"],
        "query": {"match_all": {}},
        "sort": [{"indexing_time": {"order": "desc"}}]
    }
    recent_documents_name_id = []
    response = es_obj.generic_search_query(corpus_index, recently_added_documents,
                                           size=10)
    if response:
        for hits in response['hits']['hits']:
            hits_source = hits.get('_source')
            # Strip a leading company prefix from the display title.
            title = re.sub(r'^\b(Xoriant )', '', hits_source.get('title'),
                           flags=re.IGNORECASE).strip()
            recent_documents_name_id.append({
                'doc_id': hits.get('_id'),
                'source_path': hits_source.get('source_path'),
                'file_name': hits_source.get('file_name'),
                'title': title,
                'indexing_time': hits_source.get('indexing_time'),
            })
    if recent_documents_name_id:
        return Response(json.dumps(recent_documents_name_id), status=200,
                        mimetype='application/json')
    # Bug fix: corrected 'docuemnts' typo in the user-facing failure message.
    return Response(json.dumps(
        {'failure': 'Error in getting recently added documents'}),
                    status=400, mimetype='application/json')
def reset_ratings_likes_dislikes_for_all_indexed_docs():
    """Zero out ratings, num_likes and num_dislikes on every corpus document."""
    config = utils.config_parser()
    index_name = config.get("elasticsearch", "corpus_index_name")
    doc_type = config.get("elasticsearch", "doc_type")
    reset_query = {
        "script": {
            "source": "ctx._source.ratings = 0; ctx._source.num_likes = 0; ctx._source.num_dislikes = 0",
            "lang": "painless"
        },
        "query": {"match_all": {}}
    }
    es_client = elasticsearch_connector.get_instance()
    return es_client.update_index_by_query(index_name, doc_type, reset_query)
def store_event_id_in_index(self, latest_parsed_event_id):
    """Persist *latest_parsed_event_id* on the cursor document of the app-data index."""
    config = utils.config_parser()
    latest_event_index = config.get('elasticsearch', 'app_data_index')
    update_event_id = {
        "script": {
            "source": "ctx._source.cursor_event_id = params.cursor_event_id",
            "params": {"cursor_event_id": latest_parsed_event_id}
        }
    }
    es_client = elasticsearch_connector.get_instance()
    es_client.update_document(latest_event_index, 'pptx', 'cursor_id', update_event_id)
def index_all_on_schedule():
    """Cron entry point: back up ES and re-run the keyword indexer when enabled.

    Controlled by the ``scheduler.enable`` config flag; any failure is logged
    and swallowed so the scheduler keeps running.
    """
    config = utils.config_parser()
    job_enable = config.get("scheduler", "enable").lower()
    if job_enable != "true":
        logger.info("Scheduler job disabled")
        return
    try:
        logger.info("Starting the cron job")
        # Initialize the ES connection before backup/indexing; the handle
        # itself is not needed here (the original bound it to an unused local).
        elasticsearch_connector.get_instance()
        ES_Backup_Restore.backup("True")
        index_keyword_updater.update()
        logger.info("Index All Cron job Done")
    except Exception:  # unused 'as e' binding removed; exception is logged
        logger.exception("Index using schedule Exception Occurred.")
def get_aggregated_download_logs(doc_id):
    """Return the raw search response for *doc_id* from the aggregated download-logs index."""
    index_name = utils.config_parser().get('elasticsearch',
                                           'aggregated_download_logs_index_name')
    search_query = {
        "query": {
            "match": {
                "num_of_downloads.buckets.key": {"query": doc_id}
            }
        }
    }
    es_client = elasticsearch_connector.get_instance()
    return es_client.generic_search_query(index_name, search_query)
def reset_download_count_for_all_indexed_docs(property_name, property_value):
    """Set ``ctx._source.<property_name>`` to *property_value* on every corpus document.

    NOTE(review): both arguments are spliced directly into the painless
    script source, so they must be trusted literals (e.g. "num_downloads", "0").
    """
    config = utils.config_parser()
    index_name = config.get("elasticsearch", "corpus_index_name")
    doc_type = config.get("elasticsearch", "doc_type")
    script_source = "ctx._source." + property_name + "= " + property_value
    reset_query = {
        "script": {"source": script_source, "lang": "painless"},
        "query": {"match_all": {}}
    }
    es_client = elasticsearch_connector.get_instance()
    return es_client.update_index_by_query(index_name, doc_type, reset_query)
def get_recent_queries_from_es(index_name, query):
    """Return up to 10 distinct, lower-cased recent search queries.

    Order follows the ES response; returns ``[]`` when nothing matched.
    """
    es_obj = elasticsearch_connector.get_instance()
    result = es_obj.generic_search_query(index_name, query)
    if not result:
        return []
    hits = result["hits"]["hits"]
    if not hits:
        return []
    recent_query_list = []
    # Renamed the loop variable so it no longer shadows the 'query' parameter,
    # and replaced list.__len__() with the idiomatic len().
    for hit in hits:
        recent_query = hit["_source"]["query"].lower()
        if recent_query not in recent_query_list:
            recent_query_list.append(recent_query)
            if len(recent_query_list) == 10:
                break
    return recent_query_list
def get_access_token_from_es():
    """Return a decrypted Egnyte access token, fetching and caching one if absent.

    Tokens are stored Fernet-encrypted in ES; on a cache miss a fresh token is
    obtained from Egnyte, encrypted and written back.
    """
    from elasticsearch_connector import elasticsearch_connector
    es_connect = elasticsearch_connector.get_instance()
    # NOTE(review): hard-coded symmetric key — consider moving to secret storage.
    key = b'5Hgi9bhDpDtVg69M7wAjiYYCzr9wlwvWCNlJdp5pWf0='
    cipher_suite = Fernet(key)
    access_token = es_connect.get_egnyte_token()
    if access_token:
        decoded_access_token = cipher_suite.decrypt(access_token.encode('utf-8'))
        return decoded_access_token.decode('utf-8')
    access_token_from_egnyte = egnyte_connector.get_access_token_from_egnyte()
    encoded_text = cipher_suite.encrypt(str.encode(access_token_from_egnyte))
    es_connect.set_egnyte_token(encoded_text)
    return access_token_from_egnyte
def get_user_role(user_id):
    """Look up *user_id* in the KMP users index and return its 'role', or None."""
    config = utils.config_parser()
    kmp_users_index = config.get('elasticsearch', 'kmp_users_index')
    es_obj = elasticsearch_connector.get_instance()
    if not es_obj.check_if_index_exists(kmp_users_index):
        return None
    search_query = {"query": {"match": {"_id": user_id}}}
    json_data = es_obj.generic_search_query(kmp_users_index, search_query)
    hits = json_data['hits']['hits']
    if not hits:
        return None
    return hits[0].get('_source').get('role')
def log_file_download_event(is_authenticated, is_authorized):
    """Flask view: log a file-download event and bump the document's download count.

    Responds 401 for unauthorized users and 400 on payload validation failure.
    """
    if not is_authorized:
        return render_template("unauthorized_user.html"), 401
    data_json = request.get_json()
    es_connect = elasticsearch_connector.get_instance()
    payload_data = {
        'doc_id': data_json.get('doc_id'),
        'search_query': data_json.get('search_query'),
        'searched_result_index': data_json.get('searched_result_index'),
    }
    for field in payload_data:
        response_msg = check_field_validations(payload_data, field)
        if response_msg:
            return Response(json.dumps(response_msg), status=400,
                            mimetype='application/json')
    log_data = {
        'userId': session['unique_id'].lower(),
        'doc_id': payload_data['doc_id'],
        'search_query': payload_data['search_query'],
        'download_timestamp': datetime.utcnow(),
        'searched_result_index': payload_data['searched_result_index'],
    }
    logged_result = file_download_logger.log_file_download_event(log_data)
    file_download_logger.update_download_count_for_document_by_1(
        data_json.get('doc_id'))
    if logged_result:
        return Response(json.dumps(logged_result), status=200,
                        mimetype='application/json')
    return Response(json.dumps(
        {'failure': 'Could not log the file download event.'}),
                    status=200, mimetype='application/json')
def download_a_file(self, file_obj):
    """Download *file_obj* from Egnyte when it is new or changed, and queue it for indexing.

    Skips the download when the corpus index already holds the same checksum
    AND an up-to-date parser version; otherwise saves the file locally and
    records its relative path, checksum and parser version in
    ``self.file_relative_path`` for the later indexing step. Updates the
    ``downloaded_files_count`` / ``already_indexed_count`` tallies.
    """
    client = self.egnyte_client
    config = utils.config_parser()
    corpus_directory_path = config.get('egnyte', 'corpus_path')
    file_path_separator = config.get('egnyte', 'file_path_separator')
    corpus_index_name = config.get('elasticsearch', 'corpus_index_name')
    config_parser_version = config.get('egnyte', 'parser_version')
    destination_local_path = config.get('generic', 'corpus_download_path')
    try:
        # Best-effort: create the local download directory if missing.
        if not os.path.exists(destination_local_path):
            os.mkdir(destination_local_path)
    except Exception as e:
        logger.exception(e)
    # Path relative to the corpus root; doubles as the doc-id seed.
    relative_path = file_obj.path.split(corpus_directory_path)[1]
    doc_id = utils.generate_doc_id(relative_path, file_path_separator)
    es_connect = elasticsearch_connector.get_instance()
    indexed_parameters = es_connect.return_index_parameter(
        doc_id, corpus_index_name, ['checksum', 'parser_version'])
    # Verify if file already indexed having same checksum and also check for parser version
    if indexed_parameters is False or indexed_parameters[
            'checksum'] != file_obj.checksum or indexed_parameters[
            'parser_version'] < int(config_parser_version):
        # Download the file and get/update checksum if it's not present already
        logger.info("Downloading file %s" % file_obj.name)
        file = client.file(file_obj.path)
        file_resource = file.download()
        file_resource.save_to(destination_local_path + '/' + file_obj.name, )
        logger.info("File %s updating index." % file_obj.name)
        # Record local path -> relative path plus derived checksum/parser keys
        # for the indexing pass to consume.
        key = destination_local_path + '/' + file_obj.name
        self.file_relative_path[key] = relative_path
        checksum_key = key + 'checksum'
        self.file_relative_path[checksum_key] = file_obj.checksum
        parser_key = key + 'parser_version'
        self.file_relative_path[parser_key] = int(config_parser_version)
        self.downloaded_files_count = self.downloaded_files_count + 1
    else:
        self.already_indexed_count = self.already_indexed_count + 1
        logger.info("File %s already indexed." % file_obj.name)
def fetch_users_queries_usage_from_es(period, gte, lte):
    """Return ``(distinct_user_count, query_count)`` for searches in [gte, lte].

    Counts come from a cardinality aggregation on userId and a value_count on
    query; *period* is accepted but not used by the query itself. Returns
    (0, 0) when the search yields no response.
    """
    logs_index = config.get("elasticsearch", "logs_index_name")
    usage_query = {
        "size": 0,
        "query": {
            "bool": {
                "must": [
                    {"term": {"query_type": "search"}},
                    {"range": {"query_timestamp": {"gte": gte, "lte": lte}}}
                ]
            }
        },
        "aggs": {
            "users_distinct": {"cardinality": {"field": "userId.keyword"}},
            "number_of_queries": {"value_count": {"field": "query.keyword"}}
        }
    }
    es_client = elasticsearch_connector.get_instance()
    result = es_client.generic_search_query(logs_index, usage_query)
    if not result:
        return 0, 0
    aggregations = result["aggregations"]
    users_count = aggregations['users_distinct']['value']
    queries_count = aggregations['number_of_queries']['value']
    return users_count, queries_count
def get_trending_queries_from_es(index_name, query):
    """Return up to 10 distinct, lower-cased trending queries from the
    ``trending_query`` aggregation buckets, or ``[]`` when nothing matched.
    """
    es_obj = elasticsearch_connector.get_instance()
    result = es_obj.generic_search_query(index_name, query)
    if not result:
        return []
    buckets = result["aggregations"]["trending_query"]["buckets"]
    if not buckets:
        return []
    trending_query_list = []
    # Replaced list.__len__() with len() and the eager '%' log formatting
    # with lazy logger arguments.
    for bucket in buckets:
        trending_query = bucket["key"].lower()
        if trending_query not in trending_query_list:
            trending_query_list.append(trending_query)
            if len(trending_query_list) == 10:
                logger.info("trending_query_list: %s", trending_query_list)
                break
    return trending_query_list
def log_feedback_event(log_data):
    """Insert or update a user's feedback record for a document.

    When the (userId, doc_id) pair already has a feedback document, its value
    is overwritten via update-by-query; otherwise a new document is inserted.
    Returns the ES operation result.
    """
    config = utils.config_parser()
    user_feedback_index = config.get('elasticsearch', 'user_feedback_index_name')
    es_obj = elasticsearch_connector.get_instance()
    match_user_and_doc = [
        {"match": {"userId": log_data['userId']}},
        {"match_phrase": {"doc_id": log_data['doc_id']}},
    ]
    get_feedback_data = {"query": {"bool": {"must": match_user_and_doc}}}
    json_data = es_obj.generic_search_query(user_feedback_index, get_feedback_data)
    if json_data['hits']['hits']:
        update_feedback = {
            "script": {
                "source": "ctx._source.feedback = params.feedback",
                "lang": "painless",
                "params": {"feedback": log_data['feedback']}
            },
            "query": {"bool": {"must": match_user_and_doc}}
        }
        return es_obj.update_index_by_query(user_feedback_index, "pptx",
                                            update_feedback)
    return es_obj.insert_document(user_feedback_index, "pptx", None, log_data)
def get_aggregated_download_count_for_all_downloaded_docs():
    """Return the per-document download-count buckets, or ``[]`` on failure."""
    config = utils.config_parser()
    index_name = config.get('elasticsearch', 'download_logs_index_name')
    es_obj = elasticsearch_connector.get_instance()
    aggs_query = {
        "aggs": {
            "num_of_downloads": {
                "terms": {"field": "doc_id.keyword", "size": 10000}
            }
        }
    }
    try:
        json_data = es_obj.generic_search_query(index_name, aggs_query)
        return json_data['aggregations']['num_of_downloads']['buckets']
    except Exception:
        # logger.exception preserves the traceback (logger.error dropped it).
        logger.exception(
            "Failed to get aggregated download count for all downloaded documents"
        )
        return []
def fetch_avg_query_time_from_es(gte, lte):
    """Return the average search latency in seconds (3 dp) for queries in [gte, lte].

    Latency is ``response_timestamp - query_timestamp`` averaged in
    milliseconds by a script aggregation; returns 0 when there is no data.
    """
    index_name = config.get("elasticsearch", "logs_index_name")
    query = {
        "size": 0,
        "query": {
            "bool": {
                "must": [{
                    "range": {"query_timestamp": {"gte": gte, "lte": lte}}
                }]
            }
        },
        "aggs": {
            "avg_time": {
                "avg": {
                    "script": "doc['response_timestamp'].value.getMillis() - doc['query_timestamp'].value.getMillis()"
                }
            }
        }
    }
    es_obj = elasticsearch_connector.get_instance()
    result = es_obj.generic_search_query(index_name, query)
    if not result:
        return 0
    avg_millis = result["aggregations"]["avg_time"]["value"]
    if avg_millis is None:  # idiom fix: was '!= None'
        return 0
    return round(avg_millis / 1000, 3)
def update_feedback_count_for_document(doc_id, feedback):
    """Recompute and store ratings/likes/dislikes for *doc_id* on the corpus index.

    Returns ``(num_likes, num_dislikes)`` after the update. The *feedback*
    argument is accepted but the totals are re-derived from the feedback index.
    """
    config = utils.config_parser()
    # user_feedback_index = config.get('elasticsearch', 'user_feedback_index_name')
    corpus_index_name = config.get('elasticsearch', 'corpus_index_name')
    doc_type = config.get('elasticsearch', 'doc_type')
    es_obj = elasticsearch_connector.get_instance()
    feedback_rating, num_likes, num_dislikes = \
        user_feedback_logger.get_feedback_count_for_document(doc_id)
    update_download_count = {
        "script": {
            "source": "ctx._source.ratings = params.ratings; ctx._source.num_likes = params.num_likes; ctx._source.num_dislikes = params.num_dislikes",
            "lang": "painless",
            "params": {
                "ratings": feedback_rating,
                "num_likes": num_likes,
                "num_dislikes": num_dislikes
            }
        }
    }
    es_obj.update_document(corpus_index_name, doc_type, doc_id, update_download_count)
    return num_likes, num_dislikes