def parse_documents(file_relative_path):
    config = utils.config_parser()
    destination_local_path = config.get('generic', 'corpus_download_path')
    corpus_index = config.get('elasticsearch', 'corpus_index_name')
    try:
        es_connect = elasticsearch_connector.get_instance()
        # es_connect.clear_index()
    except Exception:
        # Re-raise: without a connection, indexing below cannot proceed.
        logger.exception("Cannot connect to Elasticsearch")
        raise
    number_of_indexed_files, no_of_files_having_indexing_error, files_having_indexing_error = \
        corpus_indexer.parse_and_index_documents(destination_local_path,
                                                 es_connect, corpus_index,
                                                 file_relative_path)
    corpus_indexer.clear_corpus_download_directory(destination_local_path)
    return number_of_indexed_files, no_of_files_having_indexing_error, \
        files_having_indexing_error

def set_egnyte_token(self, access_token):
    config = utils.config_parser()
    app_data = config.get('elasticsearch', 'app_data_index')
    document = {'token': access_token.decode('utf-8')}
    document_id = 'egnyte_token'
    self.insert_document(app_data, 'pptx', document_id, document)
    logger.info("Saved Egnyte token to Elasticsearch")

def getLogger():
    if not os.path.exists("./logs"):
        os.mkdir("./logs")
    logging.basicConfig(
        format="%(asctime)s [%(filename)-25.25s] [%(levelname)-5.5s] %(message)s",
        handlers=[
            TimedRotatingFileHandler('./logs/log', when="midnight",
                                     interval=1, utc=False),
            logging.StreamHandler(),
        ])
    config = utils.config_parser()
    log_level = config.get('generic', 'log_level')
    logger = logging.getLogger()
    if log_level == "DEBUG":
        logger.setLevel(logging.DEBUG)
    elif log_level == "ERROR":
        logger.setLevel(logging.ERROR)
    elif log_level in ("INFO", ""):
        logger.setLevel(logging.INFO)
    return logger

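# Hedged usage sketch (hypothetical helper, not in the original module):
# because getLogger() configures and returns the root logger, any logger
# obtained via logging.getLogger(__name__) inherits the same file and
# stream handlers.
def _example_logging_usage():
    logger = getLogger()
    logger.info("service started")  # written to ./logs/log and to stdout
    logger.debug("emitted only when [generic] log_level is DEBUG")
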
def create_document_preview_response(doc_id):
    config = utils.config_parser()
    corpus_index_name = config.get('elasticsearch', 'corpus_index_name')
    elastic_obj = elasticsearch_connector.get_instance()
    search_query = {
        "query": {
            "match": {
                "_id": {
                    "query": doc_id
                }
            }
        },
        "_source": {
            "includes": [
                "file_name", "title", "url", "doc_type", "created_by",
                "modified_by", "num_downloads", "ratings", "created_time",
                "modified_time", "slides.page_number", "slides.thumbnail_large"
            ]
        }
    }
    result = elastic_obj.generic_search_query(corpus_index_name, search_query)
    if result['hits']['hits']:
        return result['hits']['hits'][0]['_source']
    return {}

def restore(backup_path, restore_thumbnail="True"): try: config = utils.config_parser() host = config.get('elasticsearch', 'host') port = config.get('elasticsearch', 'port') es_dump_auth = config.get('elasticsearch', 'es_dump_auth') doc_type = config.get('elasticsearch', 'doc_type') index_list = config.get('elasticsearch', 'backup_indices') output_ip = 'https://' + host + ":" + port for index in index_list.split(','): logger.info("Restoring index %s" % index) command = "NODE_TLS_REJECT_UNAUTHORIZED=0 elasticdump --input=" + backup_path + "/" + index + ".json" +" --headers '{\"Authorization\":\"Basic "+es_dump_auth+"\"}'"+ " --output=" + output_ip + \ " --type=data --output-index=" + index + "/" + doc_type os.system(command) if restore_thumbnail == "True": dest_thumbnail_folder = config.get('generic', 'thumbnail_path') source_thumbnail_folder = backup_path + '/thumbnail.zip' if os.path.exists(dest_thumbnail_folder): shutil.rmtree(dest_thumbnail_folder) shutil.unpack_archive(source_thumbnail_folder, dest_thumbnail_folder, 'zip') logger.info("Thumbnails are restored successfully.") else: logger.info("Thumbnails are not restored.") except Exception as e: logger.exception("Elasticsearch es_backup failed")
def get_download_count_for_document(doc_id):
    config = utils.config_parser()
    index_name = config.get('elasticsearch', 'download_logs_index_name')
    es_obj = elasticsearch_connector.get_instance()
    search_query = {
        "aggs": {
            "user_download_aggregation": {
                "filter": {
                    "term": {
                        "doc_id": doc_id
                    }
                },
                "aggs": {
                    "num_of_downloads": {
                        "terms": {
                            "field": "doc_id.keyword"
                        }
                    }
                }
            }
        }
    }
    try:
        json_data = es_obj.generic_search_query(index_name, search_query)
        buckets = json_data['aggregations']["user_download_aggregation"][
            "num_of_downloads"]["buckets"]
        return buckets[0]["doc_count"] if buckets else 0
    except Exception:
        logger.exception("Could not fetch download count for %s" % doc_id)
        return 0

def download_files_based_on_trigger(self, egnyte_uploaded_files):
    config = utils.config_parser()
    corpus_directory_path = config.get('egnyte', 'corpus_path')
    list_of_egnyte_files = egnyte_uploaded_files.split(',')
    count_of_files = len(list_of_egnyte_files)
    self.downloaded_files_count = 0
    self.already_indexed_count = 0
    self.skipped_files_count = 0
    # Prefix any bare file name with the corpus directory path.
    for index, file in enumerate(list_of_egnyte_files):
        if corpus_directory_path not in file:
            list_of_egnyte_files[index] = corpus_directory_path + file
    relative_file_path = Egnyte_File_Operations.get_files_recursively(
        self, corpus_directory_path, list_of_egnyte_files)
    self.file_parsing_details['count_of_files'] = count_of_files
    self.file_parsing_details['downloaded_files_count'] = self.downloaded_files_count
    self.file_parsing_details['already_indexed_count'] = self.already_indexed_count
    self.file_parsing_details['skipped_files_count'] = self.skipped_files_count
    return relative_file_path, self.file_parsing_details

def parse(path_to_file, file_relative_path):
    """Return data and metadata of a ppt file in JSON format.

    Args:
        path_to_file (str): path of the file
        file_relative_path (str): path of the file relative to the corpus root

    Returns:
        dict with data and metadata of the given file, or None on failure
    """
    file_data = None
    try:
        config = utils.config_parser()
        file_path_separator = config.get('egnyte', 'file_path_separator')
        ppt = Presentation(path_to_file)
        file_data = ppt_parser.parse_metadata(ppt, path_to_file,
                                              file_relative_path)
        doc_id = utils.generate_doc_id(file_data['source_path'],
                                       file_path_separator)
        thumbnail_image_name = doc_id
        larger_thumbnail_list, smaller_thumbnail_list = \
            thumbnail_generator.generate_thumbnail_image(path_to_file,
                                                         thumbnail_image_name)
        if not larger_thumbnail_list or not smaller_thumbnail_list:
            return None
        file_data['slides'] = ppt_parser.parse_content(
            ppt, larger_thumbnail_list, smaller_thumbnail_list, doc_id)
        file_data['title'] = ppt_parser.extract_document_level_title(
            file_data['slides'], file_data['file_name'])
    except Exception as e:
        logger.error("Failed to open file %s due to error %s" %
                     (path_to_file, str(e)))
        return None
    return file_data

class authorization():
    config = utils.config_parser()
    token_url = config.get('OAuth', 'token_url')
    callback_uri = config.get('OAuth', 'callback_uri')
    client_id = config.get('OAuth', 'client_id')
    authorize_url = config.get('OAuth', 'authorize_url')
    resource = config.get('OAuth', 'resource')

    def getAuthURL(self):
        authorization_redirect_url = (
            self.authorize_url + '?response_type=code&client_id=' +
            self.client_id + '&redirect_uri=' + self.callback_uri +
            '&resource=' + self.resource)
        return authorization_redirect_url

    def getAccessToken(self, authorization_code):
        data = {'grant_type': 'authorization_code',
                'code': authorization_code,
                'redirect_uri': self.callback_uri,
                'client_id': self.client_id}
        access_token_response = requests.post(self.token_url, data=data,
                                              verify=False,
                                              allow_redirects=False)
        tokens = json.loads(access_token_response.text)
        access_token = tokens['access_token']
        return access_token

    def decode_jwt(self, access_token):
        # Decode the token claims without verifying the signature
        # (PyJWT expects the plural `algorithms` keyword).
        decoded_token = jwt.decode(access_token, verify=False,
                                   algorithms=['RS256'])
        return decoded_token

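# Hedged usage sketch of the authorization-code flow above (hypothetical
# driver, not part of the original module): the browser is first redirected
# to getAuthURL(); the provider then calls back to callback_uri with
# ?code=..., and that code is exchanged for an access token whose claims
# decode_jwt() unpacks without verifying the signature.
def _example_oauth_flow(authorization_code):
    auth = authorization()
    login_url = auth.getAuthURL()  # send the user agent here first
    access_token = auth.getAccessToken(authorization_code)
    claims = auth.decode_jwt(access_token)
    return login_url, claims
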
def get_access_token_from_egnyte():
    """Get an access token from Egnyte.

    Returns:
        access_token on successful connection
    """
    config = utils.config_parser()
    payload = {
        'grant_type': 'password',
        'client_id': config.get('egnyte', 'api_key'),
        'username': config.get('egnyte', 'username'),
        'password': config.get('egnyte', 'password')
    }
    session = requests.session()
    # POST call to connect to Egnyte; returns an access token for the
    # active session.
    access_token_endpoint = config.get('egnyte', 'access_token_endpoint')
    token = session.post(access_token_endpoint, data=payload)
    if token.status_code == 200:
        # Parse the JSON body instead of splitting the raw response text.
        return token.json()['access_token']
    logger.error("Could not get access token from Egnyte: %s" % token.text)

def get_event_id_from_index(self):
    config = utils.config_parser()
    latest_event_index = config.get('elasticsearch', 'app_data_index')
    es_obj = elasticsearch_connector.get_instance()
    search_query = {"query": {"match": {"_id": "cursor_id"}}}
    if es_obj.check_if_index_exists(index_name=latest_event_index):
        json_data = es_obj.generic_search_query(latest_event_index,
                                                search_query)
        hits = json_data['hits']['hits']
        if not hits:
            # Index exists but the cursor document is missing; seed it.
            es_obj.insert_document(latest_event_index, 'pptx', 'cursor_id',
                                   {'cursor_event_id': 0})
            logger.info("Cursor document created successfully")
            return False
        for hit in hits:
            hit_source = hit.get('_source')
            if 'cursor_event_id' in hit_source:
                return hit_source.get('cursor_event_id')
    else:
        es_obj.insert_document(latest_event_index, 'pptx', 'cursor_id',
                               {'cursor_event_id': 0})
        logger.info("App data index created successfully")
        return False

def query_db(self, query):
    try:
        config = utils.config_parser()
        key_search_rank = config.get("redis", "key_search_history_rank")
        # Ensure there is a live client before querying; connect() is a
        # no-op when already connected.
        self.connect()
        redis_connect = self.redis_client
        keywords_rank = redis_connect.zrange(key_search_rank, 0, -1)
        # Anchor each query word at a word boundary and allow any
        # whitespace between consecutive words.
        words_in_query = ['\\b' + word for word in query.split()]
        new_query = '\\s+'.join(words_in_query)
        suggestions = [
            utils.clean_text(str(selected_keyword))
            for selected_keyword in keywords_rank
            if re.search(new_query, utils.clean_text(str(selected_keyword)))
        ]
        return suggestions
    except Exception:
        logger.exception("Could not query Redis DB")
        return None

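# Standalone sketch of the matching strategy used by query_db(): each query
# word is anchored at a word boundary (\b) and consecutive words are joined
# with \s+, so mid-word hits like "red" inside "shred" are excluded.
def _example_suggestion_match():
    import re
    new_query = r'\s+'.join(r'\b' + word for word in "red car".split())
    history = ["red carpet ideas", "shred carton boxes", "red car deals"]
    return [h for h in history if re.search(new_query, h)]
    # -> ['red carpet ideas', 'red car deals']
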
def download_trigger_based_corpus_documents(egnyte_uploaded_files):
    config = utils.config_parser()
    destination_local_path = config.get('generic', 'corpus_download_path')
    egnyte = Egnyte_File_Operations.get_instance()
    file_relative_path, file_parsing_details = \
        egnyte.download_files_based_on_trigger(egnyte_uploaded_files)
    return file_relative_path, file_parsing_details

def get_feedback_count_per_user_for_all_documents(user_id):
    config = utils.config_parser()
    user_feedback_index = config.get('elasticsearch',
                                     'user_feedback_index_name')
    elastic_obj = elasticsearch_connector.get_instance()
    user_like_dislike = {}
    aggregation_query = {
        "query": {
            "match": {
                "userId": {
                    "query": user_id
                }
            }
        },
        "aggs": {
            "user_feedback_aggregation": {
                "terms": {
                    "field": "doc_id.keyword",
                    "size": 10000
                },
                "aggs": {
                    "group_by_feedback": {
                        "terms": {
                            "field": "feedback"
                        }
                    }
                }
            }
        }
    }
    try:
        result = elastic_obj.generic_search_query(user_feedback_index,
                                                  aggregation_query)
        if not result:
            return False
        buckets = result["aggregations"]["user_feedback_aggregation"]["buckets"]
        if not buckets:
            return False
        for bucket in buckets:
            doc_id = bucket['key']
            liked_count, disliked_count = 0, 0
            for feedback in bucket['group_by_feedback']['buckets']:
                if feedback['key'] == 1:
                    liked_count = feedback['doc_count']
                elif feedback['key'] == -1:
                    disliked_count = feedback['doc_count']
            liked_status = liked_count > disliked_count
            disliked_status = liked_count < disliked_count
            user_like_dislike[doc_id] = [liked_status, disliked_status]
        return user_like_dislike
    except Exception:
        logger.exception("Could not aggregate feedback for user %s" % user_id)
        return False

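# Standalone sketch of the per-document majority rule applied above: the UI
# flag pair is [liked, disliked]; a tie leaves both False.
def _example_feedback_flags(liked_count, disliked_count):
    # _example_feedback_flags(3, 1) -> [True, False]
    # _example_feedback_flags(2, 2) -> [False, False]
    return [liked_count > disliked_count, liked_count < disliked_count]
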
def get_feedback_count_for_document(doc_id):
    config = utils.config_parser()
    user_feedback_index = config.get('elasticsearch',
                                     'user_feedback_index_name')
    elastic_obj = elasticsearch_connector.get_instance()
    aggregation_query = {
        "query": {
            "match_phrase": {
                "doc_id": {
                    "query": doc_id
                }
            }
        },
        "aggs": {
            "user_feedback_aggregation": {
                "terms": {
                    "field": "doc_id.keyword",
                    "size": 10000
                },
                "aggs": {
                    "group_by_feedback": {
                        "terms": {
                            "field": "feedback"
                        }
                    }
                }
            }
        }
    }
    try:
        result = elastic_obj.generic_search_query(user_feedback_index,
                                                  aggregation_query)
        if not result:
            return 0, 0, 0
        buckets = result["aggregations"]["user_feedback_aggregation"]["buckets"]
        if not buckets:
            return 0, 0, 0
        feedback_count, num_likes, num_dislikes = 0, 0, 0
        for feedback in buckets[0]['group_by_feedback']['buckets']:
            # Each bucket key is +1 (like) or -1 (dislike).
            feedback_count += feedback['key'] * feedback['doc_count']
            if feedback['key'] == 1:
                num_likes = feedback['doc_count']
            elif feedback['key'] == -1:
                num_dislikes = feedback['doc_count']
        return feedback_count, num_likes, num_dislikes
    except Exception:
        logger.exception("Could not aggregate feedback for document %s" % doc_id)
        return 0, 0, 0

def __init__(self, user, password):
    threading.Thread.__init__(self)
    self._mines = {}
    self._wormholes = {}
    self._user = user
    self._pass = password
    self._stop_event = threading.Event()
    self._config = utils.config_parser(utils.get_config(user, password))

def logout():
    config = utils.config_parser()
    signout_url = config.get('OAuth', 'signout_url')
    session.clear()
    response = redirect(signout_url)
    response.delete_cookie("user_name")
    response.delete_cookie("email")
    return response

def log_file_download_event(log_data):
    config = utils.config_parser()
    file_download_index = config.get('elasticsearch',
                                     'download_logs_index_name')
    elastic_obj = elasticsearch_connector.get_instance()
    result = elastic_obj.insert_document(file_download_index, "pptx", None,
                                         log_data)
    return result

def log_autosuggest_feedback_event(log_data):
    config = utils.config_parser()
    autosuggest_feedback_index = config.get(
        'elasticsearch', 'autosuggest_feedback_index_name')
    elastic_obj = elasticsearch_connector.get_instance()
    result = elastic_obj.insert_document(autosuggest_feedback_index, "pptx",
                                         None, log_data)
    return result

def log_subjective_feedback(payload_data):
    config = utils.config_parser()
    subjective_feedback_index = config.get('elasticsearch',
                                           'subjective_feedback_index')
    elastic_obj = elasticsearch_connector.get_instance()
    response = elastic_obj.insert_document(subjective_feedback_index, "pptx",
                                           None, payload_data)
    return response

def update_ratings_for_all_docs():
    config = utils.config_parser()
    user_feedback_index = config.get('elasticsearch',
                                     'user_feedback_index_name')
    corpus_index_name = config.get('elasticsearch', 'corpus_index_name')
    doc_type = config.get('elasticsearch', 'doc_type')
    elastic_obj = elasticsearch_connector.get_instance()
    aggregation_query = {
        "aggs": {
            "user_feedback_aggregation": {
                "terms": {
                    "field": "doc_id.keyword",
                    "size": 10000
                },
                "aggs": {
                    "group_by_feedback": {
                        "terms": {
                            "field": "feedback"
                        }
                    }
                }
            }
        }
    }
    result = elastic_obj.generic_search_query(user_feedback_index,
                                              aggregation_query)
    buckets = result["aggregations"]["user_feedback_aggregation"]["buckets"]
    for item in buckets:
        key = item['key']
        feedback_count, num_likes, num_dislikes = 0, 0, 0
        for feedback in item['group_by_feedback']['buckets']:
            feedback_count += feedback['key'] * feedback['doc_count']
            if feedback['key'] == 1:
                num_likes = feedback['doc_count']
            elif feedback['key'] == -1:
                num_dislikes = feedback['doc_count']
        ratings = {
            "script": {
                "source": "ctx._source.ratings = params.ratings; "
                          "ctx._source.num_likes = params.num_likes; "
                          "ctx._source.num_dislikes = params.num_dislikes",
                "lang": "painless",
                "params": {
                    "ratings": feedback_count,
                    "num_likes": num_likes,
                    "num_dislikes": num_dislikes
                }
            }
        }
        result = elastic_obj.update_document(corpus_index_name, doc_type,
                                             key, ratings)
        if result:
            logger.info("Aggregated ratings updated on corpus index")
        else:
            logger.error("Could not aggregate ratings on corpus index")

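# Standalone sketch of the rating arithmetic above: feedback bucket keys are
# +1 (like) or -1 (dislike), so the net rating is likes minus dislikes.
def _example_net_rating(buckets):
    # Buckets mimic the terms-aggregation output, e.g.
    # [{'key': 1, 'doc_count': 7}, {'key': -1, 'doc_count': 2}] -> 5
    return sum(b['key'] * b['doc_count'] for b in buckets)
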
def main():
    # access_token = connect_and_get_access_token()
    # print(access_token)
    egnyte_connect = egnyte_connector()
    eg_config = utils.config_parser()
    access_token = eg_config.get('egnyte', 'access_token')
    # Connect to Egnyte using the domain and access token.
    client = egnyte.EgnyteClient({
        "domain": "xoriant.egnyte.com",
        "access_token": access_token
    })

def backup(backup_thumbnail="True"): try: date = str(datetime.today().date()) config = utils.config_parser() host = config.get('elasticsearch', 'host') port = config.get('elasticsearch', 'port') es_dump_auth = config.get('elasticsearch', 'es_dump_auth') index_list = config.get('elasticsearch', 'backup_indices') gcs_bucket_name = config.get('elasticsearch', 'gcs_bucket_name') backup_to_gcs = config.get('elasticsearch', 'backup_to_gcs') thumbnail_path = config.get('generic', 'thumbnail_path') input_ip = 'https://' + host + ":" + port output_path = "./esbackup_" + date if not os.path.exists(output_path): os.mkdir(output_path) else: output_path = output_path + datetime.now().strftime( "_%H-%M-%S") os.mkdir(output_path) if backup_thumbnail == "True": thumbnail_output_path = output_path + '/thumbnail' if os.path.isdir(thumbnail_output_path): shutil.rmtree(thumbnail_output_path) logger.info("Creating backup of thumbnails...") try: shutil.copytree(thumbnail_path, thumbnail_output_path) except OSError as e: # If the error was caused because the source wasn't a directory if e.errno == errno.ENOTDIR: shutil.copy(thumbnail_path, output_path) else: logger.exception('Directory not copied. Error: %s' % e) if os.path.exists(thumbnail_output_path): shutil.make_archive(thumbnail_output_path, 'zip', thumbnail_output_path) if os.path.exists(thumbnail_output_path): shutil.rmtree(thumbnail_output_path) else: logger.info("Thumbnails backup is not created.") for index in index_list.split(','): logger.info("Creating es_backup for index %s" % index) command = "NODE_TLS_REJECT_UNAUTHORIZED=0 elasticdump --input=" + input_ip + "/" + index + " --headers '{\"Authorization\":\"Basic " + es_dump_auth + "\"}'" + " --output=" + output_path + "/" + index + ".json --type=data" os.system(command) if backup_to_gcs == 'true': os.system("gsutil cp -r " + output_path + " gs://" + gcs_bucket_name) except Exception as e: logger.exception("Elasticsearch es_backup failed")
def index_specific_on_schedule():
    config = utils.config_parser()
    job_enable = config.get("scheduler", "enable").lower()
    if job_enable == "true":
        try:
            logger.info("Starting the cron job")
            corpus_indexer.index_based_on_event()
            logger.info("Index-based-on-event cron job done")
        except Exception:
            logger.exception("Index for specific file using schedule: "
                             "exception occurred.")
    else:
        logger.info("Scheduler job disabled")

def update_download_count_for_document(doc_id, update_query):
    config = utils.config_parser()
    corpus_index_name = config.get('elasticsearch', 'corpus_index_name')
    doc_type = config.get('elasticsearch', 'doc_type')
    es_obj = elasticsearch_connector.get_instance()
    try:
        es_obj.update_document(corpus_index_name, doc_type, doc_id,
                               update_query)
        return True
    except Exception:
        logger.exception("Updating download count failed for %s" % doc_id)
        return False

def get_recently_added_documents(is_authenticated, is_authorized):
    # Returns recently added documents from the corpus index, sorted by
    # the indexing_time field.
    if not is_authorized:
        return render_template("unauthorized_user.html"), 401
    config = utils.config_parser()
    corpus_index = config.get('elasticsearch', 'corpus_index_name')
    es_obj = elasticsearch_connector.get_instance()
    recent_documents_name_id = []
    recently_added_documents = {
        "_source": ["source_path", "file_name", "title", "indexing_time"],
        "query": {
            "match_all": {}
        },
        "sort": [{
            "indexing_time": {
                "order": "desc"
            }
        }]
    }
    response = es_obj.generic_search_query(corpus_index,
                                           recently_added_documents, size=10)
    if response:
        for hits in response['hits']['hits']:
            hits_source = hits.get('_source')
            recent_data = {
                'doc_id': hits.get('_id'),
                'source_path': hits_source.get('source_path'),
                'file_name': hits_source.get('file_name'),
                # Strip a leading "Xoriant " prefix from the title.
                'title': re.sub(r'^\b(Xoriant )', '',
                                hits_source.get('title'),
                                flags=re.IGNORECASE).strip(),
                'indexing_time': hits_source.get('indexing_time')
            }
            recent_documents_name_id.append(recent_data)
    if recent_documents_name_id:
        return Response(json.dumps(recent_documents_name_id), status=200,
                        mimetype='application/json')
    return Response(json.dumps(
        {'failure': 'Error in getting recently added documents'}),
        status=400, mimetype='application/json')

def connect(self):
    redis_connect = self.redis_client
    if redis_connect is not None and redis_connect.ping():
        logger.info('Already connected to Redis DB')
    else:
        config = utils.config_parser()
        redis_host = config.get("redis", "host")
        redis_password = config.get("redis", "password")
        redis_port = config.get("redis", "port")
        self.redis_client = redis.Redis(host=redis_host, port=redis_port,
                                        db=0, password=redis_password)
        logger.info("Successfully connected to Redis DB")

def store_event_id_in_index(self, latest_parsed_event_id):
    config = utils.config_parser()
    latest_event_index = config.get('elasticsearch', 'app_data_index')
    es_obj = elasticsearch_connector.get_instance()
    update_event_id = {
        "script": {
            "source": "ctx._source.cursor_event_id = params.cursor_event_id",
            "params": {
                "cursor_event_id": latest_parsed_event_id
            }
        }
    }
    es_obj.update_document(latest_event_index, 'pptx', 'cursor_id',
                           update_event_id)

def reset_ratings_likes_dislikes_for_all_indexed_docs():
    config = utils.config_parser()
    index_name = config.get("elasticsearch", "corpus_index_name")
    doc_type = config.get("elasticsearch", "doc_type")
    reset_query = {
        "script": {
            "source": "ctx._source.ratings = 0; "
                      "ctx._source.num_likes = 0; "
                      "ctx._source.num_dislikes = 0",
            "lang": "painless"
        },
        "query": {
            "match_all": {}
        }
    }
    es_obj = elasticsearch_connector.get_instance()
    return es_obj.update_index_by_query(index_name, doc_type, reset_query)

def get_topics(is_authenticated, is_authorized):
    # Returns the top 10 trending topics configured for the service.
    if not is_authorized:
        return render_template("unauthorized_user.html"), 401
    config = utils.config_parser()
    topic_list = config.get('trending_topics', 'topic_list').split(",")
    if topic_list:
        return Response(json.dumps({'topics': topic_list}), status=200,
                        mimetype='application/json')
    return Response(json.dumps({'failure': 'No topics found'}), status=204,
                    mimetype='application/json')