def main():
    """Fetch the nginx module index page and store every module in Elasticsearch.

    Reads the documentation URL, request timeout, retry count and sleep
    interval from the ``common`` config section, downloads the index page,
    and processes each linked module whose name starts with ``ngx_`` and
    does not contain ``http_api``. The keyword entries collected from all
    processed modules are bulk-inserted into the ``keyword`` index at the end.
    """
    base = BaseInit()
    base.es_index_init()

    nginx_doc_url = get_config("common", "nginx_doc_url")
    timeout = int(get_config("common", "timeout"))
    retry = int(get_config("common", "retry"))
    interval = int(get_config("common", "interval"))

    logger.info("Start get url {}".format(nginx_doc_url))
    index_html = retry_call(
        get_request_text, fargs=[nginx_doc_url, timeout], tries=retry
    )
    module_links = NginxPage(index_html).get_module_names()

    # Variable and directive names gathered from the modules; this is the
    # data that backs keyword search.
    keyword_info = []
    for link in module_links:
        # "path/to/ngx_foo.html" -> "ngx_foo"
        module_name = link.rsplit("/", 1)[-1].partition(".")[0]
        if not module_name.startswith("ngx_") or "http_api" in module_name:
            continue

        module = NginxModule(link)
        module.handle_module_direct_info()
        module.handle_module_vars_info()
        module.save_module_info_to_es()
        keyword_info.extend(module.keyword_info)
        # Pause between modules (interval comes from the config).
        time.sleep(interval)

    base.es_ins.insert_mul_index_data("keyword", keyword_info)
def get_module_variable_location(self):
    """Return the ``<a>`` elements whose ``name`` attribute is ``"variables"``.

    The anchor is used by callers to locate the variables section.
    ``self.d`` is presumably the parsed page document (confirm against the
    class constructor).
    """
    logger.info("获取variables指令的a标签,方便定位操作")

    def is_variables_anchor(i, this):
        # Keep only anchors declared as <a name="variables">.
        return pq(this).attr("name") == "variables"

    anchors = self.d("a")
    return anchors.filter(is_variables_anchor)
def delete_index_data_by_id(self, index_name, id, index_type="_doc"):
    """Delete a single document from an index and log the response.

    :param index_name: name of the index holding the document
    :param id: id of the document to delete
    :param index_type: document type passed to the client as ``doc_type``
    :return: None
    """
    logger.info(self.es.delete(index=index_name, doc_type=index_type, id=id))
def create_index(self, index_name, map_body=None):
    """Create ``index_name`` if it does not exist yet and log the response.

    :param index_name: name of the index to create
    :param map_body: optional request body (e.g. mappings); it is only sent
        when truthy
    :return: None
    """
    indices = self.es.indices
    if indices.exists(index=index_name):
        # Nothing to do: the index is already there.
        return

    create_kwargs = {"index": index_name}
    if map_body:
        create_kwargs["body"] = map_body
    logger.info(indices.create(**create_kwargs))
def get_module_item_location(self):
    """Return every ``<a>`` element that carries a ``name`` attribute.

    These named anchors mark the individual menu entries and are used by
    callers to locate them. ``self.d`` is presumably the parsed page
    document (confirm against the class constructor).
    """
    logger.info("获取每个菜单指令的a标签,方便定位操作")

    def has_name(i, this):
        # Anchors without a name attribute are plain links, not markers.
        return pq(this).attr("name") is not None

    anchors = self.d("a")
    return anchors.filter(has_name)
def insert_one_index_data(self, index_name, index_data, index_type="_doc"):
    """Store one document in Elasticsearch and log the response.

    :param index_name: name of the target index
    :param index_data: document body to index
    :param index_type: document type passed to the client as ``doc_type``
    :return: None
    """
    logger.info(
        self.es.index(index=index_name, doc_type=index_type, body=index_data)
    )
def delete_index_data_by_query(self, index_name, query_body):
    """Delete everything matched by ``query_body`` and log the response.

    :param index_name: name of the index to delete from
    :param query_body: Elasticsearch query selecting the documents to delete
    :return: None
    """
    logger.info(self.es.delete_by_query(index=index_name, body=query_body))
def get_module_names(self):
    """Return the ``href`` of every module link on the index page.

    Finds the ``<center>`` element whose ``<h4>`` child reads
    "Modules reference", then collects the ``href`` attribute of each
    ``<a>`` found in the ``<ul>`` siblings that follow it.

    :return: list of ``href`` attribute values, in document order
    """
    logger.info("获取所有模块页面列表")

    def is_modules_heading(i, this):
        return pq(this).children("h4").text() == "Modules reference"

    heading = self.d("center").filter(is_modules_heading)
    module_links = heading.nextAll("ul").find("a")

    hrefs = []
    for link in module_links.items():
        hrefs.append(link.attr("href"))
    return hrefs
def get_data_by_id(self, index_name, id, index_type="_doc"):
    """Fetch one document by id, log its ``_source`` and return it.

    The previous implementation went on to iterate ``res["hits"]["hits"]``.
    That is the shape of a *search* response; a get-by-id response has no
    ``hits`` key, so the loop raised ``KeyError`` on every successful call.
    The loop has been removed and the document source is now returned.

    :param index_name: name of the index holding the document
    :param id: id of the document to fetch
    :param index_type: document type passed to the client as ``doc_type``
    :return: the ``_source`` of the fetched document
    """
    res = self.es.get(index=index_name, doc_type=index_type, id=id)
    source = res["_source"]
    logger.info(source)
    return source
def insert_mul_index_data(self, index_name, mul_index_data, index_type="_doc"):
    """Store a batch of documents in Elasticsearch with the bulk helper.

    :param index_name: name of the target index
    :param mul_index_data: iterable of document bodies
    :param index_type: document type written into each action as ``_type``
    :return: None
    """
    # One bulk action per document.
    actions = [
        {"_index": index_name, "_type": index_type, "_source": doc}
        for doc in mul_index_data
    ]
    success, _ = bulk(self.es, actions, index=index_name, raise_on_error=True)
    logger.info("Performed %d actions" % success)
def get_request_text(url, timeout):
    """GET ``url`` and return the response body as text.

    The HTTP status code is not inspected here; whatever body the server
    sends back is returned.

    :param url: address to fetch
    :param timeout: timeout forwarded to ``requests.get``
    :return: ``text`` of the response
    """
    response = requests.get(url, timeout=timeout)
    message = "get url: {} successfully...".format(url)
    logger.info(message)
    return response.text
def __init__(self):
    """Define the index mappings and create the Elasticsearch client wrapper.

    Sets ``keyword_map`` and ``module_map`` (index mapping bodies),
    ``direct_retain_keyword`` (a list of section names) and ``es_ins``
    (an ``ElasticSearch`` instance built from the ``elasticsearch`` config
    section).
    """
    # (field name, Elasticsearch field type) pairs, in mapping order.
    keyword_fields = (
        ("keyword", "text"),
        ("module_name", "text"),
    )
    module_fields = (
        ("module_name", "text"),
        ("compatibility", "text"),
        ("properties", "text"),
        ("arguments", "text"),
        ("definitions", "text"),
        ("protocol", "text"),
        ("commands", "text"),
        ("data", "text"),
        ("summary", "text"),
        ("variables", "object"),
        ("issues", "text"),
        ("example", "text"),
        ("endpoints", "nested"),
        ("directives", "text"),
        ("directive_info", "nested"),
    )

    self.keyword_map = {
        "mappings": {
            "properties": {
                name: {"type": kind} for name, kind in keyword_fields
            }
        }
    }
    self.direct_retain_keyword = [
        "endpoints",
        "arguments",
        "summary",
        "issues",
        "example",
        "directives",
        "compatibility",
        "definitions",
        "protocol",
        "variables",
        "commands",
        "data",
        "compatibility",
        "properties",
    ]
    self.module_map = {
        "mappings": {
            "properties": {
                name: {"type": kind} for name, kind in module_fields
            }
        }
    }

    logger.info("初始化es实例")
    es_ips = get_config("elasticsearch", "ips")
    es_port = get_config("elasticsearch", "port")
    self.es_ins = ElasticSearch(ips=es_ips, port=es_port)
def delete_index(self, index_name):
    """Delete ``index_name`` if it exists and log the deletion.

    :param index_name: name of the index to delete
    :return: None
    """
    indices = self.es.indices
    if not indices.exists(index=index_name):
        # Nothing to delete.
        return

    indices.delete(index=index_name)
    logger.info("删除索引{}成功".format(index_name))