def set_alias(self, alias_name, index_name):
    indices = IndicesClient(self.elastic)
    # Drop the alias from any previous timestamped index (e.g. <alias>_20180101).
    old_index = alias_name + '_20*'
    try:
        indices.delete_alias(index=old_index, name=alias_name)
    except NotFoundError:
        # No previous index carried the alias; nothing to remove.
        pass
    except Exception as e:
        logger.error("Could not remove old alias %s from %s: %s",
                     alias_name, old_index, str(e))
        return
    try:
        indices.put_alias(index=index_name, name=alias_name, body={})
    except Exception as e:
        logger.error("Alias %s for %s not created: %s",
                     alias_name, index_name, str(e))
        return
    logger.info("Set alias %s -> %s", alias_name, index_name)
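# Usage sketch (hypothetical caller; `Indexer` and the index name below are
# made up, and `self.elastic` above is assumed to be an elasticsearch-py client):
#
#   indexer = Indexer(es_client)
#   indexer.set_alias("products", "products_20240101")
#
# Readers keep querying the stable alias while each rebuild writes to a fresh
# timestamped index. Note that delete_alias followed by put_alias leaves a
# brief window with no alias; the indices update_aliases API can perform the
# swap in a single atomic request instead.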
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--src-host", action="store", default="127.0.0.1", type=str,
                        help="Source host [default: %(default)s]")
    parser.add_argument("--src-port", action="store", default=9200, type=int,
                        help="Source port [default: %(default)s]")
    parser.add_argument("--src-index", action="store", default="", type=str,
                        help="Source index")
    parser.add_argument("--src-batch-size", action="store", type=int, default=5000,
                        help="Source query batch size [default: %(default)s]")
    parser.add_argument("--src-scroll-interval", action="store", type=str, default="60m",
                        help="Interval for source scroll query [default: %(default)s]")
    parser.add_argument("--dest-host", action="store", default="127.0.0.1", type=str,
                        help="Destination host [default: %(default)s]")
    parser.add_argument("--dest-port", action="store", default=9200, type=int,
                        help="Destination port [default: %(default)s]")
    parser.add_argument("--dest-index", action="store", default="", type=str,
                        help="Destination index")
    parser.add_argument("--dest-batch-size", action="store", type=int, default=5000,
                        help="Destination batch size [default: %(default)s]")
    parser.add_argument("--dest-alias", action="store",
                        help="Destination index alias (set after population has finished)")
    parser.add_argument("--dest-concurrency", action="store", type=int, default=4,
                        help="Number of destination worker processes [default: %(default)s]")
    parser.add_argument("--dest-delete-index", action="store_true",
                        help="Delete destination index before starting")
    parser.add_argument("--query", action="store", type=str, default="",
                        help="Query to use [if none is specified, a match_all is used]")
    args = parser.parse_args()

    if not args.src_index:
        raise Exception("--src-index must be specified!")
    if not args.dest_index:
        raise Exception("--dest-index must be specified!")

    dt_start = datetime.now()

    # copy mapping
    src_es_instance = get_elasticsearch(args.src_host, args.src_port)
    dest_es_instance = get_elasticsearch(args.dest_host, args.dest_port)

    # check if src_index exists
    src_es_ic = IndicesClient(src_es_instance)
    if not src_es_ic.exists(args.src_index):
        raise Exception("--src-index %s does not exist!" % args.src_index)

    # check if dest_index exists
    dest_es_ic = IndicesClient(dest_es_instance)
    if dest_es_ic.exists(args.dest_index):
        if args.dest_delete_index:
            dest_es_ic.delete(index=args.dest_index)
        else:
            raise Exception("--dest-index %s already exists! "
                            "Use --dest-delete-index if you want to drop it"
                            % args.dest_index)

    log.info("Copying mapping...")
    # copy mapping over to dest
    src_index_information = src_es_ic.get(index=args.src_index)
    dest_es_ic.create(index=args.dest_index,
                      body=src_index_information.get(args.src_index, {}))
    # set number_of_replicas to 0 to speed up bulk indexing
    dest_es_ic.put_settings(index=args.dest_index,
                            body={"settings": {"index": {"number_of_replicas": 0}}})

    # perform multiprocessing: one source reader feeding several destination workers
    log.info("Copying data...")
    MAGIC_STRING = "%s:%s" % (str(uuid4()), str(uuid4()))
    DEST_QUEUE = Queue()
    DEST_COUNTER = Value('i', 0)
    src_process = Process(target=src_worker, args=(args, DEST_QUEUE, MAGIC_STRING))
    src_process.start()
    dest_processes = [Process(target=dest_worker,
                              args=(args, DEST_QUEUE, MAGIC_STRING, DEST_COUNTER))
                      for _ in range(args.dest_concurrency)]
    for p in dest_processes:
        p.start()
    src_process.join()
    for p in dest_processes:
        p.join()
    log.info("[dest_worker] Total processed %s" % DEST_COUNTER.value)

    if args.dest_alias:
        # we remove all existing mappings of this alias, then add it to the current dest_index
        for idx_name, aliases_mapping in dest_es_ic.get_aliases().items():
            if args.dest_alias in aliases_mapping.get("aliases", {}):
                dest_es_ic.delete_alias(index=idx_name, name=args.dest_alias)
        dest_es_ic.put_alias(index=args.dest_index, name=args.dest_alias)

    dest_es_ic.refresh(args.dest_index)

    dt_end = datetime.now()
    log.info("Time elapsed: %s" % (dt_end - dt_start, ))
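# Entry-point guard (assumed wiring; the script name below is hypothetical):
#   python es_copy_index.py --src-index logs_v1 --dest-index logs_v2 --dest-alias logs
if __name__ == "__main__":
    main()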
class UpdateService(object):
    """Service that performs a full update."""

    def __init__(self, taskName, handlerConfigPath):
        self.handlerConfigPath = handlerConfigPath
        self.redisClient = remote.getRedisClient()
        self.esClient = remote.getElasticClient()
        self.esIndexClient = IndicesClient(self.esClient)
        # instance of RedisStatus
        self.statusKey = taskName
        self.status = None
        # class name of the handler
        self.handlerClassName = 'modules.handlers.v1.CommonHandler'

    def update(self):
        """Update action."""
        # Check the status to see whether a full update may start.
        if not self._checkStatus():
            return False
        # Update the status to mark that a full update is about to begin.
        self._switchStatus()
        # Set the new consumer's offsets to latest.
        self._copyKafkaOffset()
        # Read data from MySQL and sync it into ES.
        syncResult = self._syncFromMysqlToEs()
        if syncResult:
            # Consume from the Kafka queue until the new consumer catches up
            # with the old one.
            self._catchupwithCurrentConsumer()
            # Remove the aliases from the old ES indices and add them to the new ones.
            self._setESIndexAlias()
            # Update the status.
            self._resetStatus(True)
        else:
            # Update the status.
            self._resetStatus(False)
        # Clean up dirty data.
        self._cleanDirtyData()
        return syncResult

    def reset(self):
        """Reset the status and clean up dirty data."""
        status = RedisStatus(self.statusKey)
        status.sync()
        self.status = status
        self._restoreESIndexAlias()
        self._resetStatus(False)
        self._cleanDirtyData()

    def clean(self):
        """Clean up dirty data."""
        self._cleanDirtyData()

    def _cleanDirtyData(self):
        """Clean up dirty data left behind by failures, including stale
        status configs and stale ES indices.
        """
        configKeys = self.redisClient.keys('CK_%s_*' % self.statusKey)
        status = RedisStatus(self.statusKey)
        status.sync()
        liveKeys = [status.config, status.nextConfig, status.tmpConfig]
        for key in configKeys:
            if key not in liveKeys:
                self._deleteStatusConfig(key, deleteIndices=True)
            elif key == status.tmpConfig:
                self._deleteStatusConfig(key, deleteIndices=False, deleteAlias=True)

    def _fail(self, errinfo):
        _logger.error(errinfo)
        print(errinfo, file=sys.stderr)

    def _checkStatus(self):
        status = RedisStatus(self.statusKey)
        status.sync()
        self.status = status
        if status.code != STATUS_INITIAL and status.code != STATUS_INCRE_UPDATING:
            self._fail('status code [%s] MUST be in values array [%s, %s]'
                       % (status.code, STATUS_INITIAL, STATUS_INCRE_UPDATING))
            return False
        else:
            return True

    def _loadConfig(self):
        """Load the handler config from the conf/handlers/ directory.
        Also generate a new consumer_group_id and a new es_index suffix,
        and write all of it to redis.
        """
        configKey = 'CK_%s_%s' % (self.statusKey,
                                  app.getUuid('update.loadconfig.gen.config_key'))
        kafkaGroupId = app.getUuid('update.loadconfig.gen.kafka_group_id')
        esIndexSuffix = str(int(time.time())) + '_' + str(random.randint(0, 100000))
        handlerConfig = HandlerConfig()
        handlerConfig.loadFromFile(self.handlerConfigPath)
        statusConfig = RedisStatusConfig(configKey)
        statusConfig.kafkaGroupId = kafkaGroupId
        statusConfig.esIndexSuffix = esIndexSuffix
        statusConfig.handlerConfig = handlerConfig
        statusConfig.set()
        _logger.info('load config: %s', statusConfig)
        return statusConfig

    def _acquireDLock(self):
        return self.redisClient.acquireDLock(_DISTRIBUTE_REDIS_LOCK_PREFIX + self.statusKey)

    def _releaseDLock(self, lock):
        return self.redisClient.releaseDLock(lock)

    def _switchStatus(self):
        """Update the status to mark that a full update is starting."""
        lock = None
        try:
            # Create the statusConfig needed for the full update.
            statusConfig = self._loadConfig()
            lock = self._acquireDLock()
            # Check the status again after acquiring the lock.
            if not self._checkStatus():
                return False
            self.status.code = STATUS_FULL_UPDATING
            self.status.nextConfig = statusConfig.key
            self.status.update()
        except Exception as e:
            _logger.info('fail to switch status to full updating: %s', e)
            raise
        finally:
            if lock:
                self._releaseDLock(lock)

    def _copyKafkaOffset(self):
        """Set the new consumer's offsets to latest."""
        # First fetch all partitions of the Kafka topic.
        topicName = config().get('kafka', 'topic')
        if self.status.nextConfig:
            nextStatusConfig = RedisStatusConfig(self.status.nextConfig, forceSync=True)
            nextConsumer = None
            try:
                nextConsumer = remote.getKafkaConsumer(
                    nextStatusConfig.kafkaGroupId,
                    autoCommit=False,
                    autoOffsetReset='latest'
                )
                _logger.debug('next kafka groupid is: %s', nextStatusConfig.kafkaGroupId)
                clusterMetadata = nextConsumer.list_topics(topicName)
                topicMetadata = clusterMetadata.topics.get(topicName, {})
                partitions = topicMetadata.partitions
                for pid in partitions.keys():
                    p = TopicPartition(topicName, pid)
                    nextConsumer.assign([p])
                    msg = nextConsumer.poll(10)
                    if msg:
                        offset = msg.offset() - 1
                        _logger.debug('pid[%s] topic[%s] offset[%s]', pid, topicName, offset)
                        if offset >= 0:
                            p.offset = offset
                            nextConsumer.commit(offsets=[p])
            except Exception:
                _logger.error('exception occurs when setting offset for new consumer: %s',
                              Failure())
                raise
            finally:
                if nextConsumer:
                    nextConsumer.close()

    def _syncFromMysqlToEs(self):
        nextStatusConfig = RedisStatusConfig(self.status.nextConfig, forceSync=True)
        # Full sync from MySQL.
        try:
            handlerClass = utils.classForName(self.handlerClassName)
            commonHandler = handlerClass(nextStatusConfig)
            commonHandler.syncFromMySQL()
            return True
        except Exception:
            _logger.error('exception occurs when syncFromMySQL: %s', Failure())
            return False

    def _catchupwithCurrentConsumer(self):
        """Consume from Kafka with the new consumer_group_id until the new
        consumer catches up with the old one.
        """
        # TODO

    def _setESIndexAlias(self):
        """Remove the aliases from the old ES indices and add them to the
        new ES indices.
        """
        configKey = self.status.config
        if configKey:
            statusConfig = RedisStatusConfig(configKey, forceSync=True)
            self._removeESIndexAlias(statusConfig)
        nextConfigKey = self.status.nextConfig
        if nextConfigKey:
            nextStatusConfig = RedisStatusConfig(nextConfigKey, forceSync=True)
            self._addESIndexAlias(nextStatusConfig)

    def _restoreESIndexAlias(self):
        """Restore the aliases on the old ES indices and remove them from
        the new ES indices.
        """
        nextConfigKey = self.status.nextConfig
        if nextConfigKey:
            nextStatusConfig = RedisStatusConfig(nextConfigKey, forceSync=True)
            self._removeESIndexAlias(nextStatusConfig)
        configKey = self.status.config
        if configKey:
            statusConfig = RedisStatusConfig(configKey, forceSync=True)
            self._addESIndexAlias(statusConfig)

    def _removeESIndexAlias(self, statusConfig):
        """Remove the alias from each ES index in the given config."""
        esIndexSuffix = statusConfig.esIndexSuffix
        indices = statusConfig.handlerConfig.indices()
        for indexAlias in indices:
            indexName = indexAlias + '_' + esIndexSuffix
            try:
                self.esIndexClient.delete_alias(index=indexName, name=indexAlias)
            except NotFoundError as e:
                _logger.error('index[%s] not found: %s', indexName, e)

    def _addESIndexAlias(self, statusConfig):
        """Add the alias to each ES index in the given config."""
        esIndexSuffix = statusConfig.esIndexSuffix
        indices = statusConfig.handlerConfig.indices()
        for indexAlias in indices:
            indexName = indexAlias + '_' + esIndexSuffix
            try:
                self.esIndexClient.put_alias(index=indexName, name=indexAlias)
            except NotFoundError as e:
                _logger.error('index[%s] not found: %s', indexName, e)

    def _resetStatus(self, succeeded):
        """Reset the status to mark that the full update has finished and
        incremental updating resumes.
        """
        lock = None
        try:
            lock = self._acquireDLock()
            if succeeded:
                # After success: promote nextConfig to the live config.
                self.status.code = STATUS_INCRE_UPDATING
                self.status.tmpConfig = self.status.config
                self.status.config = self.status.nextConfig
                self.status.nextConfig = ''
                self.status.update()
            else:
                # After failure: keep the old config and park nextConfig.
                self.status.code = STATUS_INCRE_UPDATING
                self.status.tmpConfig = self.status.nextConfig
                self.status.nextConfig = ''
                self.status.update()
                # If the update failed, delete the dirty data immediately.
                self._deleteStatusConfig(self.status.tmpConfig, deleteIndices=True)
        except Exception:
            _logger.info('fail to reset status to incre-updating: %s', Failure())
            raise
        finally:
            if lock:
                self._releaseDLock(lock)

    def _deleteStatusConfig(self, configKey, deleteIndices=False, deleteAlias=False):
        # Delete the status config.
        if configKey:
            statusConfig = RedisStatusConfig(configKey, forceSync=True)
            if deleteIndices:
                handlerConfig = statusConfig.handlerConfig
                for indexAlias in handlerConfig.indices():
                    indexName = indexAlias + '_' + statusConfig.esIndexSuffix
                    if self.esIndexClient.exists(index=indexName):
                        self.esIndexClient.delete(index=indexName)
            elif deleteAlias:
                # Only remove the alias; keep the index itself.
                self._removeESIndexAlias(statusConfig)
            # Delete the status config from redis (expire after one day).
            statusConfig.delete(24 * 3600)

    def _removeNextEsIndex(self):
        """Remove the new ES index."""
        # TODO

    def _removeNextStatusConfig(self):
        """Remove the new status config."""
        # TODO
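# Driver sketch (hypothetical; the task name and config path are made up, and
# `remote`, `RedisStatus`, etc. are assumed to be configured elsewhere):
#
#   service = UpdateService('item_search', 'conf/handlers/common.yaml')
#   if service.update():        # full MySQL -> ES rebuild, then alias swap
#       print('full update finished')
#   else:
#       service.reset()         # restore old aliases and status
#   service.clean()             # drop leftover dirty data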