class ElasticHelper(object):
    """Thin convenience wrapper around an Elasticsearch client.

    NOTE(review): relies on module-level names not visible in this chunk:
    ``Elasticsearch``, ``helpers`` and ``ElasticConfig``.
    """

    def __init__(self):
        # Connection URI comes from project configuration (defined elsewhere).
        self.es = Elasticsearch(ElasticConfig.uri)

    def index(self, body, index, doc_type):
        """Index a single document into ``index``/``doc_type``."""
        self.es.index(body=body, index=index, doc_type=doc_type)

    def bulk(self, body, index, doc_type):
        """Submit a pre-built bulk request body."""
        self.es.bulk(body=body, index=index, doc_type=doc_type)

    def scan(self, body, index, doc_type):
        """Return a scan/scroll iterator over every hit matching ``body``."""
        return helpers.scan(self.es, query=body, index=index,
                            doc_type=doc_type, preserve_order=True)

    def search(self, body, index, doc_type):
        """Run a search; return the raw response dict, or None on failure.

        BUG FIX: search responses report failures under the "error" key,
        not "errors" (that key belongs to bulk responses) — the old check
        could never fire, so error responses were returned as if valid.
        """
        try:
            rsp = self.es.search(body=body, index=index, doc_type=doc_type,
                                 request_timeout=100)
            if rsp.get("error"):
                print("es search error")
                return
            return rsp
        except Exception as e:
            print("es search error: " + str(e))

    def count(self, body, index, doc_type):
        """Return the count-API response for ``body``."""
        return self.es.count(index=index, doc_type=doc_type, body=body,
                             request_timeout=100)

    def delete_index(self, index):
        """Delete the whole index."""
        return self.es.indices.delete(index=index)
import pandas as pd
from elasticsearch5 import Elasticsearch

# pid = 2337

# NOTE(review): ``ES_HOST`` is not defined in this chunk — presumably a
# module-level constant defined elsewhere in the file; confirm before running.
es = Elasticsearch(hosts=ES_HOST)
# Total document count of the prd_review index, captured at import time.
count = es.count(index="prd_review")['count']


def get_mtermvectors(ids):
    """Fetch term vectors for the "title" field of several docs at once.

    ``ids`` is a list of document ids; returns the raw ``docs`` list from
    the mtermvectors response.
    """
    body = dict()
    body["ids"] = ids
    body["parameters"] = {"fields": ["title"]}
    res = es.mtermvectors(index='prd_review', doc_type='_doc', body=body)['docs']
    return res


def get_termvectors(id):
    """Return the term_vectors dict for one document, or None when the
    document has no "title" term vectors."""
    res = es.termvectors(index='prd_review', doc_type='_doc', id=id)['term_vectors']
    if 'title' in res.keys():
        return res
    else:
        return None


def sort_terms_vector(term_vectors):
    # NOTE(review): this definition is cut off at the end of the visible
    # chunk — the rest of its body is not shown here, so only the guard
    # clause and the accumulator initialisation are documented.
    if not term_vectors:
        return None
    term_dict = {}
class ElasticHelper(object):
    """Elasticsearch helper with buffered ("delayed") bulk indexing.

    NOTE(review): depends on module-level names not visible in this chunk:
    ``Elasticsearch``, ``helpers``, ``logger``, ``time``, ``ElasticConfig``,
    ``datetime_now_obj``, ``get_n_min_ago``, ``get_must_statement`` and
    ``get_term_statement``.
    """

    def __init__(self):
        self.es = Elasticsearch(ElasticConfig.uri)
        # Accumulator for msearch results; presumably filled elsewhere — TODO confirm.
        self._multi_search_results = []
        # Pending bulk actions: alternating action-metadata / document pairs.
        self.bulk_task_queue = []
        # Time of the last flush; drives the time-based flush trigger below.
        self.bulk_last_time = datetime_now_obj()

    def delay_index(self, body, index, doc_type):
        """Queue one document for bulk indexing; flush the queue when it is
        large or stale (see ``_can_do_bulk``)."""
        self.bulk_task_queue.append({"index": {"_index": index, "_type": doc_type}})
        self.bulk_task_queue.append(body)
        if self._can_do_bulk():
            self.bulk(body=self.bulk_task_queue, index=index, doc_type=doc_type)
            self.bulk_task_queue = []
            self.bulk_last_time = datetime_now_obj()

    def _can_do_bulk(self):
        # The task queue holds more than 100 entries
        # (action+document pairs, i.e. roughly 50 documents).
        if len(self.bulk_task_queue) > 100:
            return True
        # More than one minute has passed since the last flush.
        if get_n_min_ago(1) > self.bulk_last_time:
            return True
        return False

    def index(self, body, index, doc_type):
        """Index one document immediately (no buffering)."""
        self.es.index(body=body, index=index, doc_type=doc_type)

    def bulk(self, body, index, doc_type):
        """Submit a pre-built bulk request body."""
        self.es.bulk(body=body, index=index, doc_type=doc_type)

    def scan(self, body, index, doc_type):
        """Return a scan/scroll iterator over every hit matching ``body``."""
        return helpers.scan(self.es, query=body, index=index, doc_type=doc_type,
                            preserve_order=True)

    def search(self, body, index, doc_type):
        """Run a search; return the raw response dict, or None on error."""
        try:
            rsp = self.es.search(body=body, index=index, doc_type=doc_type,
                                 request_timeout=100)
            if rsp.get("error"):
                logger.error(rsp.get("error").get("reason"))
                return
            return rsp
        except Exception as e:
            # NOTE(review): prints the whole query body — looks like a
            # debugging leftover; confirm it is intentional.
            print(body)
            logger.error("es search error: " + str(e) + index)

    def count(self, body, index, doc_type):
        """Return the count-API response for ``body``."""
        return self.es.count(index=index, doc_type=doc_type, body=body,
                             request_timeout=100)

    def delete_index(self, index):
        """Delete the whole index."""
        return self.es.indices.delete(index=index)

    def put_template(self, name, body, **kwargs):
        """Create or update an index template."""
        return self.es.indices.put_template(name=name, body=body, **kwargs)

    def exists_template(self, name, **kwargs) -> bool:
        """Return True when the named index template exists."""
        return self.es.indices.exists_template(name=name, **kwargs)

    def delete_template(self, name, **kwargs):
        """Delete the named index template."""
        return self.es.indices.delete_template(name=name, **kwargs)

    def get_template(self, name, **kwargs):
        """Fetch the named index template."""
        return self.es.indices.get_template(name=name, **kwargs)

    def wait_log_in_database(self, computer_name, record_number):
        """Poll ES until the given event-log record has been indexed.

        The message queue and ES indexing run independently, so a log entry
        may be consumed before ES has stored it; this polls until the
        document appears and returns its ``_id`` (or None on timeout/error).
        """
        count = 0
        query = {
            "query": get_must_statement(
                get_term_statement("computer_name", computer_name),
                get_term_statement("record_number", record_number)),
            "_source": False,
            "size": 1
        }
        while True:
            try:
                rsp = self.es.search(body=query, index=ElasticConfig.event_log_index,
                                     doc_type=ElasticConfig.event_log_doc_type,
                                     request_timeout=100)
                if rsp.get("error"):
                    logger.error(rsp.get("error").get("reason"))
                    break
                if len(rsp["hits"]["hits"]) > 0:
                    return rsp["hits"]["hits"][0]["_id"]
                time.sleep(2)
                # Give up once count reaches 10 (2s sleeps, ~20s total).
                # NOTE(review): the original comment claimed "5 tries /
                # 10 seconds", which contradicts the ``count == 10`` check.
                if count == 10:
                    break
                count += 1
            except Exception as e:
                logger.error("es wait_log_in_database search error: " + str(e))
                break

    def multi_search(self, body, index, doc_type):
        """Run an msearch; return the raw response dict, or None on error.

        NOTE(review): only checks a top-level "error"; per-sub-query errors
        inside "responses" are not inspected here — callers must handle them.
        """
        try:
            rsp = self.es.msearch(body=body, index=index, doc_type=doc_type,
                                  request_timeout=100)
            if rsp.get("error"):
                logger.error(rsp.get("error").get("reason"))
                return
            return rsp
        except Exception as e:
            logger.error("es msearch error: " + str(e))