def run(self):
    """Consume entries from the ``es:harm:insert:info`` Redis queue and
    upsert the matching HBase rows into Elasticsearch one at a time.

    Each queue entry is expected to be the textual repr of a
    ``(rowkey, table_type)`` tuple. Runs forever.
    """
    # literal_eval only parses Python literals; the original used eval(),
    # which would execute arbitrary code arriving through Redis.
    from ast import literal_eval

    while True:
        result = self.redis_con.get_yy_rowkey("es:harm:insert:info")
        logging.info(result)
        if not result:
            # NOTE(review): assumes get_yy_rowkey can return an empty value
            # when the queue is drained -- back off instead of crashing on
            # literal_eval(None). Confirm against RedisCon implementation.
            time.sleep(1)
            continue
        rowkey, table_type = literal_eval(result)
        _id = rowkey
        # These three tables are keyed in ES by the md5 of the HBase rowkey.
        if table_type in ("WECHAT_INFO_TABLE", "INFO_TABLE", "MONITOR_INFO_TABLE"):
            _id = trans_md5(rowkey)
        logging.info("表格%s的rowkey的值为:%s" % (table_type, rowkey))
        row = self.hbase_con.getResultByRowkey(table_type, rowkey, HARM_INFO_ZIDUAN[table_type])
        if not row:
            # Nothing in HBase for this rowkey; skip.
            continue
        self.es_ping()
        if self.es.exists(HARM_INFO_ZIDUAN[table_type], "sino", _id):
            logging.info("rowkey值已存在")
            self.es.update(HARM_INFO_ZIDUAN[table_type], doc_type="sino", id=_id, body={"doc": row})
            logging.info("%s数据更新成功" % _id)
        else:
            logging.info("rowkey值:%s不存在" % _id)
            self.es.index(HARM_INFO_ZIDUAN[table_type], doc_type="sino", id=_id, body=row)
def run(self):
    """Continuously drain the ``forum_info`` Redis queue and bulk-sync rows
    from HBase ``MONITOR_INFO_TABLE`` into the ``forum_info`` ES index.

    Queue entries may carry a ``|||||``-separated column list that restricts
    which fields are fetched for updates. Accumulated actions are flushed
    when the batch exceeds COUNT_NUM or is older than 30 seconds.
    """
    action_list = []
    count = 0
    start = int(time.time())
    cunzai = 0  # number of already-existing docs (updates) since last flush
    while True:
        rowkey = self.redis_con.get_rowkey("forum_info")
        if rowkey is None:
            # Queue drained: flush whatever accumulated, then back off.
            if len(action_list) > 0:
                logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                cunzai = 0
                self.commit(action_list)
                action_list.clear()
                start = int(time.time())
                count = 0
            time.sleep(10)
            continue
        param = None
        # "rowkey|||||f1,f2" restricts the columns fetched on update.
        if "|||||" in rowkey:
            params = rowkey.split("|||||")[1]
            param = params.split(",")
            rowkey = rowkey.split("|||||")[0]
        _id = trans_md5(rowkey)
        if self.es.exists("forum_info", "sino", _id):
            row = self.hbase_con.getResultByRowkey("MONITOR_INFO_TABLE", rowkey, "forum_info", param)
            if not row:
                continue
            # Count only rows that actually produce an update action (the
            # original incremented before the empty-row check, overcounting).
            cunzai = cunzai + 1
            action_list.append({
                "_op_type": "update",
                "_index": "forum_info",
                "_type": "sino",
                "_id": _id,
                "doc": row,
            })
        else:
            row = self.hbase_con.getResultByRowkey("MONITOR_INFO_TABLE", rowkey, "forum_info")
            if not row:
                continue
            action_list.append({
                "_index": "forum_info",
                "_type": "sino",
                "_id": _id,
                "_source": row,
            })
        end = int(time.time())
        count = count + 1
        # Flush on batch size or 30-second batch age.
        if count > COUNT_NUM or (end - start) > 30:
            self.es_ping()
            logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
            cunzai = 0
            if len(action_list) > 0:
                self.commit(action_list)
            start = int(time.time())
            action_list.clear()
            count = 0
def run(self):
    """Sync ``xw_info`` rowkeys from Redis into the ``xw_info`` ES index.

    Existing documents are batched as bulk update actions; new documents are
    indexed immediately (bypassing the batch) and counted in
    ``self.insert_count``. Batches flush past COUNT_NUM actions or after
    10 seconds.
    """
    action_list = []
    count = 0
    start = int(time.time())
    cunzai = 0  # number of already-existing docs (updates) since last flush
    while True:
        rowkey = self.redis_con.get_rowkey("xw_info")
        if rowkey is None:
            # Queue drained: flush accumulated updates, then back off.
            if len(action_list) > 0:
                logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                cunzai = 0
                self.commit(action_list)
                action_list.clear()
                start = int(time.time())
                count = 0
            time.sleep(10)
            continue
        param = None
        # "rowkey|||||f1,f2" restricts the columns fetched on update.
        if "|||||" in rowkey:
            params = rowkey.split("|||||")[1]
            param = params.split(",")
            rowkey = rowkey.split("|||||")[0]
        _id = trans_md5(rowkey)
        if self.es.exists("xw_info", "sino", _id):
            row = self.hbase_con.getResultByRowkey("INFO_TABLE", rowkey, "xw_info", param)
            if not row:
                continue
            cunzai = cunzai + 1
            action_list.append({
                "_op_type": "update",
                "_index": "xw_info",
                "_type": "sino",
                "_id": _id,
                "doc": row,
            })
        else:
            row = self.hbase_con.getResultByRowkey("INFO_TABLE", rowkey, "xw_info")
            if not row:
                continue
            # New documents are indexed directly instead of being batched.
            self.es.index(index="xw_info", doc_type="sino", id=_id, body=row)
            self.insert_count = self.insert_count + 1
            continue
        end = int(time.time())
        count = count + 1
        # Flush on batch size or 10-second batch age.
        if count > COUNT_NUM or (end - start) > 10:
            self.es_ping()
            logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
            cunzai = 0
            start = int(time.time())
            self.commit(action_list)
            action_list.clear()
            count = 0
def run(self):
    """Consume rowkeys from ``es:news:del:info`` and delete the matching
    documents from the ``xw_info`` index, retrying once on failure.

    Runs forever; a failed retry is logged rather than allowed to kill the
    worker thread (the original retried outside any try block, so a second
    failure propagated and terminated the loop).
    """
    while True:
        rowkey = self.redis_con.get_yy_rowkey("es:news:del:info")
        if not rowkey:
            # NOTE(review): assumes the queue can hand back an empty value;
            # back off instead of passing None to trans_md5. Confirm
            # against RedisCon implementation.
            time.sleep(1)
            continue
        _id = trans_md5(rowkey)
        self.es_ping()
        try:
            if self.es.exists(index="xw_info", doc_type="sino", id=_id):
                self.es.delete(index="xw_info", doc_type="sino", id=_id)
        except Exception as e:
            logging.error("news info delete error %s" % str(e))
            # Re-check the connection and retry once; a second failure is
            # logged instead of crashing the sync thread.
            try:
                self.es_ping()
                if self.es.exists(index="xw_info", doc_type="sino", id=_id):
                    self.es.delete(index="xw_info", doc_type="sino", id=_id)
            except Exception as retry_err:
                logging.error("news info delete error %s" % str(retry_err))
def run(self):
    """Sync ``wx_info`` rowkeys from Redis into the ``wx_info`` ES index via
    bulk actions.

    Existing documents become bulk update actions (optionally restricted to
    a column list); new documents become bulk index actions. Batches flush
    past COUNT_NUM actions or after 30 seconds.
    """
    action_list = []
    count = 0
    start = int(time.time())
    cunzai = 0  # number of already-existing docs (updates) since last flush
    while True:
        # Fetch the next value needing sync from Redis.
        rowkey = self.redis_con.get_rowkey("wx_info")
        # blpop is not available here, so an empty result marks a drained queue.
        if rowkey is None:
            # No new data: flush whatever accumulated, then back off.
            if len(action_list) > 0:
                logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                cunzai = 0
                self.commit(action_list)
                action_list.clear()
                start = int(time.time())
                count = 0
            time.sleep(10)
            continue
        param = None
        # Split the rowkey from the optional field list to sync.
        if "|||||" in rowkey:
            params = rowkey.split("|||||")[1]
            param = params.split(",")
            rowkey = rowkey.split("|||||")[0]
        # The HBase rowkey is stored in Elasticsearch as its md5.
        _id = trans_md5(rowkey)
        # HEAD request (~30ms) to check whether _id already exists in ES.
        exists = self.es.exists("wx_info", "sino", _id)
        action = {
            "_index": "wx_info",
            "_type": "sino",
            "_id": "",
        }
        # If the document already exists, upload via an update action.
        if exists:
            row = self.hbase_con.getResultByRowkey("WECHAT_INFO_TABLE", rowkey, "wx_info", param)
            if not row:
                continue
            action["_op_type"] = "update"
            action['doc'] = row
            cunzai = cunzai + 1
        # Otherwise upload via an index action (the field restriction is
        # irrelevant for brand-new documents).
        else:
            row = self.hbase_con.getResultByRowkey("WECHAT_INFO_TABLE", rowkey, "wx_info")
            if not row:
                continue
            action['_source'] = row
        action['_id'] = _id
        action_list.append(action)
        end = int(time.time())
        count = count + 1
        # Flush when the batch exceeds COUNT_NUM or is older than 30 seconds.
        if count > COUNT_NUM or (end - start) > 30:
            self.es_ping()
            logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
            cunzai = 0
            self.commit(action_list)
            start = int(time.time())
            action_list.clear()
            count = 0
def run(self):
    """Test variant of the ``xw_info`` sync that additionally records each
    processed rowkey (and its content_md5) into the ``es:news:info:tongji``
    Redis hash for bookkeeping.

    New documents are re-queued onto ``xw_info_test`` after being batched so
    a later pass observes them as existing. Batches flush past COUNT_NUM
    actions or after 20 seconds.
    """
    action_list = []
    count = 0
    start = int(time.time())
    cunzai = 0  # number of already-existing docs (updates) since last flush
    while True:
        rowkey = self.redis_con.get_rowkey("xw_info_test")
        if rowkey is None:
            # Queue drained: flush whatever accumulated, then back off.
            if len(action_list) > 0:
                logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                cunzai = 0
                self.commit(action_list)
                action_list.clear()
                start = int(time.time())
                count = 0
            time.sleep(10)
            continue
        else:
            # Mark the bare rowkey as seen in the bookkeeping hash.
            self.redis_con.redis_con.hset("es:news:info:tongji", rowkey.split("|||||")[0], 1)
        param = None
        # "rowkey|||||f1,f2" restricts the columns fetched on update.
        if "|||||" in rowkey:
            params = rowkey.split("|||||")[1]
            param = params.split(",")
            rowkey = rowkey.split("|||||")[0]
        _id = trans_md5(rowkey)
        action = {
            "_index": "xw_info",
            "_type": "sino",
            "_id": "",
        }
        if self.es.exists("xw_info", "sino", _id):
            row = self.hbase_con.getResultByRowkey("INFO_TABLE", rowkey, "xw_info", param)
            if not row:
                continue
            action["_op_type"] = "update"
            action['doc'] = row
            cunzai = cunzai + 1
            # Consistent with the rest of the file, use logging, not print().
            logging.info("修改" + rowkey)
        else:
            row = self.hbase_con.getResultByRowkey("INFO_TABLE", rowkey, "xw_info")
            if not row:
                continue
            action['_source'] = row
            logging.info("新增" + rowkey)
            # Re-queue so the next pass treats this rowkey as an update.
            self.redis_con.set_rowkey("xw_info_test", rowkey)
            # .get avoids a KeyError when the row lacks a content_md5 column
            # (the original used row['content_md5'] and could crash).
            if row.get('content_md5'):
                self.redis_con.redis_con.hset("es:news:info:tongji", rowkey, row['content_md5'])
        action['_id'] = _id
        action_list.append(action)
        end = int(time.time())
        count = count + 1
        # Flush on batch size or 20-second batch age.
        if count > COUNT_NUM or (end - start) > 20:
            self.es_ping()
            logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
            cunzai = 0
            start = int(time.time())
            self.commit(action_list)
            action_list.clear()
            count = 0