Example no. 1
    def __init__(self, conf):
        CommonSyncer.__init__(self, conf)

        if not isinstance(self._conf.src_conf, MongoConfig):
            raise RuntimeError('invalid src config type')
        self._src = MongoHandler(self._conf.src_conf)
        if not self._src.connect():
            raise RuntimeError('connect to mongodb(src) failed: %s' % self._conf.src_hostportstr)
        if not isinstance(self._conf.dst_conf, MongoConfig):
            raise RuntimeError('invalid dst config type')
        self._dst = MongoHandler(self._conf.dst_conf)
        if not self._dst.connect():
            raise RuntimeError('connect to mongodb(dst) failed: %s' % self._conf.dst_hostportstr)
        self._multi_oplog_replayer = MultiOplogReplayer(self._dst, 10)
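
A minimal wiring sketch for the constructor above (the module paths and MongoConfig arguments are assumptions; only the attribute names src_conf, dst_conf, src_hostportstr and dst_hostportstr come from the code itself) might look like this:

# Sketch only: module paths and MongoConfig arguments are assumed,
# not taken from the project.
from mongosync.config import Config, MongoConfig   # assumed layout
from mongosync.syncer import MongoSyncer           # assumed layout

conf = Config()
conf.src_conf = MongoConfig('127.0.0.1', 27017)  # source replica set member
conf.dst_conf = MongoConfig('127.0.0.1', 27018)  # destination mongod
syncer = MongoSyncer(conf)  # raises RuntimeError if either connection fails
syncer.run()                # assumed entry point inherited from CommonSyncer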
Example no. 2
    def __init__(self, conf):
        CommonSyncer.__init__(self, conf)

        if not isinstance(self._conf.src_conf, MongoConfig):
            raise Exception('invalid src config type')
        self._src = MongoHandler(self._conf.src_conf)
        if not self._src.connect():
            raise Exception('connect to mongodb(src) failed: %s' % self._conf.src_hostportstr)

        if not isinstance(self._conf.dst_conf, EsConfig):
            raise Exception('invalid dst config type')
        self._dst = EsHandler(self._conf.dst_conf)
        if not self._dst.connect():
            raise Exception('connect to elasticsearch(dst) failed: %s' % self._conf.dst_hostportstr)

        self._action_buf = []  # used to bulk write oplogs
        self._last_bulk_optime = None
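
_action_buf accumulates Elasticsearch bulk actions and is flushed once it reaches a fixed size (_action_buf_full in the full class below uses 40). A self-contained sketch of that buffered flush pattern, with placeholder index and document values (client construction varies by elasticsearch-py version):

# Buffered bulk-write sketch; index/type/document values are placeholders.
# The threshold of 40 mirrors _action_buf_full() in the full EsSyncer class.
import elasticsearch
import elasticsearch.helpers

es = elasticsearch.Elasticsearch(['127.0.0.1:9200'])
action_buf = []

def flush(actions):
    if actions:
        elasticsearch.helpers.bulk(client=es, actions=actions)

for i in range(100):
    action_buf.append({
        '_op_type': 'index',
        '_index': 'demo_index',
        '_type': 'demo_type',
        '_id': str(i),
        '_source': {'value': i},
    })
    if len(action_buf) >= 40:  # buffer full: bulk write and reset
        flush(action_buf)
        action_buf = []

flush(action_buf)  # write any remaining actions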
Example no. 3
class MongoSyncer(CommonSyncer):
    """ MongoDB synchronizer.
    """
    def __init__(self, conf):
        CommonSyncer.__init__(self, conf)

        if not isinstance(self._conf.src_conf, MongoConfig):
            raise RuntimeError('invalid src config type')
        self._src = MongoHandler(self._conf.src_conf)
        if not self._src.connect():
            raise RuntimeError('connect to mongodb(src) failed: %s' %
                               self._conf.src_hostportstr)
        if not isinstance(self._conf.dst_conf, MongoConfig):
            raise RuntimeError('invalid dst config type')
        self._dst = MongoHandler(self._conf.dst_conf)
        if not self._dst.connect():
            raise RuntimeError('connect to mongodb(dst) failed: %s' %
                               self._conf.dst_hostportstr)
        self._multi_oplog_replayer = MultiOplogReplayer(self._dst, 10)

    def _create_index(self, namespace_tuple):
        """ Create indexes.
        """
        def format(key_direction_list):
            """ Format key and direction of index.
            """
            res = []
            for key, direction in key_direction_list:
                if isinstance(direction, float) or isinstance(direction, long):
                    direction = int(direction)
                res.append((key, direction))
            return res

        dbname, collname = namespace_tuple
        dst_dbname, dst_collname = self._conf.db_coll_mapping(dbname, collname)
        index_info = self._src.client()[dbname][collname].index_information()
        for name, info in index_info.iteritems():
            keys = info['key']
            options = {}
            options['name'] = name
            if 'unique' in info:
                options['unique'] = info['unique']
            if 'sparse' in info:
                options['sparse'] = info['sparse']
            if 'expireAfterSeconds' in info:
                options['expireAfterSeconds'] = info['expireAfterSeconds']
            if 'partialFilterExpression' in info:
                options['partialFilterExpression'] = info[
                    'partialFilterExpression']
            if 'dropDups' in info:
                options['dropDups'] = info['dropDups']

            # indexes are created before documents are imported, so the 'background' option is not needed
            # if 'background' in info:
            #     options['background'] = info['background']

            # for text index
            if 'weights' in info:
                options['weights'] = info['weights']
            if 'default_language' in info:
                options['default_language'] = info['default_language']
            if 'language_override' in info:
                options['language_override'] = info['language_override']

            self._dst.create_index(dst_dbname, dst_collname, format(keys),
                                   **options)

    def _sync_collection(self, namespace_tuple):
        """ Sync a collection until success.
        """
        # create indexes first
        self._create_index(namespace_tuple)

        src_dbname, src_collname = namespace_tuple
        dst_dbname, dst_collname = self._conf.db_coll_mapping(
            src_dbname, src_collname)
        src_ns = '%s.%s' % (src_dbname, src_collname)

        total = self._src.client()[src_dbname][src_collname].count()
        self._progress_logger.register(src_ns, total)

        while True:
            try:
                cursor = self._src.client()[src_dbname][src_collname].find(
                    filter=None,
                    cursor_type=pymongo.cursor.CursorType.EXHAUST,
                    no_cursor_timeout=True,
                    modifiers={'$snapshot': True})

                reqs = []
                reqs_max = 100
                groups = []
                groups_max = 10
                n = 0

                for doc in cursor:
                    reqs.append(
                        pymongo.ReplaceOne({'_id': doc['_id']},
                                           doc,
                                           upsert=True))
                    if len(reqs) == reqs_max:
                        groups.append(reqs)
                        reqs = []
                    if len(groups) == groups_max:
                        threads = [
                            gevent.spawn(self._dst.bulk_write,
                                         dst_dbname,
                                         dst_collname,
                                         groups[i],
                                         ordered=False,
                                         ignore_duplicate_key_error=True)
                            for i in xrange(groups_max)
                        ]
                        gevent.joinall(threads, raise_error=True)
                        groups = []

                    n += 1
                    if n % 10000 == 0:
                        self._progress_logger.add(src_ns, n)
                        n = 0

                if len(groups) > 0:
                    threads = [
                        gevent.spawn(self._dst.bulk_write,
                                     dst_dbname,
                                     dst_collname,
                                     groups[i],
                                     ordered=False,
                                     ignore_duplicate_key_error=True)
                        for i in xrange(len(groups))
                    ]
                    gevent.joinall(threads, raise_error=True)
                if len(reqs) > 0:
                    self._dst.bulk_write(dst_dbname,
                                         dst_collname,
                                         reqs,
                                         ordered=False,
                                         ignore_duplicate_key_error=True)

                self._progress_logger.add(src_ns, n, done=True)
                return
            except pymongo.errors.AutoReconnect:
                self._src.reconnect()

    def _sync_large_collection(self, namespace_tuple, split_points):
        """ Sync large collection.
        """
        # create indexes first
        self._create_index(namespace_tuple)

        dbname, collname = namespace_tuple
        ns = '.'.join(namespace_tuple)

        log.info('pending to sync %s with %d processes' %
                 (ns, len(split_points) + 1))

        coll = self._src.client()[dbname][collname]
        total = coll.count()
        self._progress_logger.register(ns, total)

        prog_q = multiprocessing.Queue()
        res_q = multiprocessing.Queue()

        proc_logging = multiprocessing.Process(target=logging_progress,
                                               args=(ns, total, prog_q))
        proc_logging.start()

        queries = []
        lower_bound = None
        for point in split_points:
            if lower_bound is None:
                queries.append({'_id': {'$lt': point}})
            else:
                queries.append({'_id': {'$gte': lower_bound, '$lt': point}})
            lower_bound = point
        queries.append({'_id': {'$gte': lower_bound}})

        procs = []
        for query in queries:
            p = multiprocessing.Process(
                target=self._sync_collection_with_query,
                args=(namespace_tuple, query, prog_q, res_q))
            p.start()
            procs.append(p)
            log.info('start process %s with query %s' % (p.name, query))

        for p in procs:
            p.join()

        n_docs = 0
        for p in procs:
            n_docs += res_q.get()
        self._progress_logger.add(ns, n_docs, done=True)

        prog_q.put(True)
        prog_q.close()
        prog_q.join_thread()
        proc_logging.join()

    def _sync_collection_with_query(self, namespace_tuple, query, prog_q,
                                    res_q):
        """ Sync collection with query.
        """
        self._src.reconnect()
        self._dst.reconnect()

        src_dbname, src_collname = namespace_tuple
        dst_dbname, dst_collname = self._conf.db_coll_mapping(
            src_dbname, src_collname)

        while True:
            try:
                cursor = self._src.client()[src_dbname][src_collname].find(
                    filter=query,
                    cursor_type=pymongo.cursor.CursorType.EXHAUST,
                    no_cursor_timeout=True,
                    # $snapshot may cause blocking, possibly a bug
                    # modifiers={'$snapshot': True}
                )
                total = 0
                n = 0
                reqs = []
                reqs_max = 100
                groups = []
                groups_max = 10

                for doc in cursor:
                    reqs.append(
                        pymongo.ReplaceOne({'_id': doc['_id']},
                                           doc,
                                           upsert=True))
                    if len(reqs) == reqs_max:
                        groups.append(reqs)
                        reqs = []
                    if len(groups) == groups_max:
                        threads = [
                            gevent.spawn(self._dst.bulk_write,
                                         dst_dbname,
                                         dst_collname,
                                         groups[i],
                                         ordered=False,
                                         ignore_duplicate_key_error=True)
                            for i in xrange(groups_max)
                        ]
                        gevent.joinall(threads, raise_error=True)
                        groups = []

                    n += 1
                    total += 1
                    if n % 10000 == 0:
                        prog_q.put(n)
                        n = 0

                if len(groups) > 0:
                    threads = [
                        gevent.spawn(self._dst.bulk_write,
                                     dst_dbname,
                                     dst_collname,
                                     groups[i],
                                     ordered=False,
                                     ignore_duplicate_key_error=True)
                        for i in xrange(len(groups))
                    ]
                    gevent.joinall(threads, raise_error=True)
                if len(reqs) > 0:
                    self._dst.bulk_write(dst_dbname,
                                         dst_collname,
                                         reqs,
                                         ordered=False,
                                         ignore_duplicate_key_error=True)

                if n > 0:
                    prog_q.put(n)
                res_q.put(total)

                prog_q.close()
                prog_q.join_thread()
                res_q.close()
                res_q.join_thread()
                return
            except pymongo.errors.AutoReconnect:
                self._src.reconnect()

    def _replay_oplog(self, start_optime):
        """ Replay oplog.
        """
        self._last_optime = start_optime

        n_total = 0
        n_skip = 0

        while True:
            try:
                start_optime_valid = False
                need_log = False
                host, port = self._src.client().address
                log.info('try to sync oplog from %s on %s:%d' %
                         (self._last_optime, host, port))
                cursor = self._src.tail_oplog(start_optime)
            except IndexError as e:
                log.error(e)
                log.error('%s not found, terminate' % self._last_optime)
                return
            except Exception as e:
                log.error('get oplog cursor failed: %s' % e)
                continue

            # loop: read and apply oplog
            while True:
                try:
                    if need_log:
                        self._log_optime(self._last_optime)
                        self._log_progress()
                        need_log = False

                    if not cursor.alive:
                        log.error('cursor is dead')
                        raise pymongo.errors.AutoReconnect

                    oplog = cursor.next()
                    n_total += 1

                    # check start optime once
                    if not start_optime_valid:
                        if oplog['ts'] == self._last_optime:
                            log.info('oplog is ok: %s' % self._last_optime)
                            start_optime_valid = True
                        else:
                            log.error('oplog %s is stale, terminate' %
                                      self._last_optime)
                            return

                    if oplog['op'] == 'n':  # no-op
                        self._last_optime = oplog['ts']
                        need_log = True
                        continue

                    # validate oplog
                    if not self._conf.data_filter.valid_oplog(oplog):
                        n_skip += 1
                        self._last_optime = oplog['ts']
                        need_log = True
                        continue

                    dbname, collname = mongo_utils.parse_namespace(oplog['ns'])
                    dst_dbname, dst_collname = self._conf.db_coll_mapping(
                        dbname, collname)
                    if dst_dbname != dbname or dst_collname != collname:
                        oplog['ns'] = '%s.%s' % (dst_dbname, dst_collname)

                    if self._stage == Stage.post_initial_sync:
                        if self._multi_oplog_replayer:
                            if mongo_utils.is_command(oplog):
                                self._multi_oplog_replayer.apply(
                                    ignore_duplicate_key_error=True)
                                self._multi_oplog_replayer.clear()
                                self._dst.apply_oplog(oplog)
                                self._last_optime = oplog['ts']
                                need_log = True
                            else:
                                self._multi_oplog_replayer.push(oplog)
                                if (oplog['ts'] == self._initial_sync_end_optime
                                        or self._multi_oplog_replayer.count() == self._oplog_batchsize):
                                    self._multi_oplog_replayer.apply(
                                        ignore_duplicate_key_error=True)
                                    self._multi_oplog_replayer.clear()
                                    self._last_optime = oplog['ts']
                                    need_log = True
                        else:
                            self._dst.apply_oplog(
                                oplog, ignore_duplicate_key_error=True)
                            self._last_optime = oplog['ts']
                            need_log = True

                        if oplog['ts'] == self._initial_sync_end_optime:
                            log.info('step into stage: oplog_sync')
                            self._stage = Stage.oplog_sync
                    else:
                        if self._multi_oplog_replayer:
                            if mongo_utils.is_command(oplog):
                                self._multi_oplog_replayer.apply()
                                self._multi_oplog_replayer.clear()
                                self._dst.apply_oplog(oplog)
                                self._last_optime = oplog['ts']
                                need_log = True
                            else:
                                self._multi_oplog_replayer.push(oplog)
                                if self._multi_oplog_replayer.count() == self._oplog_batchsize:
                                    self._multi_oplog_replayer.apply()
                                    self._multi_oplog_replayer.clear()
                                    self._last_optime = oplog['ts']
                                    need_log = True
                        else:
                            self._dst.apply_oplog(oplog)
                            self._last_optime = oplog['ts']
                            need_log = True
                except StopIteration as e:
                    if self._multi_oplog_replayer and self._multi_oplog_replayer.count() > 0:
                        self._multi_oplog_replayer.apply()
                        self._multi_oplog_replayer.clear()
                        self._last_optime = self._multi_oplog_replayer.last_optime()
                        need_log = True
                    # no more oplogs, wait a moment
                    time.sleep(0.1)
                    self._log_optime(self._last_optime)
                    self._log_progress('latest')
                except pymongo.errors.DuplicateKeyError as e:
                    if self._stage == Stage.oplog_sync:
                        log.error(e)
                        log.error('terminate')
                        return
                    else:
                        log.error('ignore duplicate key error: %s' % e)
                        continue
                except pymongo.errors.AutoReconnect as e:
                    log.error(e)
                    self._src.reconnect()
                    break
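
_sync_large_collection above turns its pre-computed _id split points into non-overlapping range queries, one per worker process. The same partitioning step in isolation (a sketch assuming a non-empty split_points list, as the method does):

# Sketch of the split-point partitioning used by _sync_large_collection:
# N split points yield N + 1 non-overlapping _id ranges (assumes N >= 1).
def build_range_queries(split_points):
    queries = []
    lower_bound = None
    for point in split_points:
        if lower_bound is None:
            queries.append({'_id': {'$lt': point}})
        else:
            queries.append({'_id': {'$gte': lower_bound, '$lt': point}})
        lower_bound = point
    queries.append({'_id': {'$gte': lower_bound}})
    return queries

# Two split points produce three ranges:
#   [{'_id': {'$lt': 1000}},
#    {'_id': {'$gte': 1000, '$lt': 2000}},
#    {'_id': {'$gte': 2000}}]
print(build_range_queries([1000, 2000]))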
Example no. 4
class EsSyncer(CommonSyncer):
    """ Elasticsearch synchronizer.
    """
    def __init__(self, conf):
        CommonSyncer.__init__(self, conf)

        if not isinstance(self._conf.src_conf, MongoConfig):
            raise Exception('invalid src config type')
        self._src = MongoHandler(self._conf.src_conf)
        if not self._src.connect():
            raise Exception('connect to mongodb(src) failed: %s' % self._conf.src_hostportstr)

        if not isinstance(self._conf.dst_conf, EsConfig):
            raise Exception('invalid dst config type')
        self._dst = EsHandler(self._conf.dst_conf)
        if not self._dst.connect():
            raise Exception('connect to elasticsearch(dst) failed: %s' % self._conf.dst_hostportstr)

        self._action_buf = []  # used to bulk write oplogs
        self._last_bulk_optime = None

    def _action_buf_full(self):
        return len(self._action_buf) >= 40

    def _sync_database(self, dbname):
        """ Sync a database.
        """
        log.info("sync database '%s'" % dbname)
        # create index
        idxname = self._conf.db_mapping(dbname)
        if self._dst.client().indices.exists(index=idxname):
            log.info('index already existed: %s' % idxname)
        else:
            log.info('create index: %s' % idxname)
            self._dst.client().indices.create(index=idxname)
        self._sync_collections(dbname)

    def _sync_collection(self, namespace_tuple):
        """ Sync a collection until success.
        """
        src_dbname, src_collname = namespace_tuple[0], namespace_tuple[1]
        idxname, typename = self._conf.db_coll_mapping(src_dbname, src_collname)
        fields = self._conf.fieldmap.get(gen_namespace(src_dbname, src_collname))

        while True:
            try:
                log.info("sync collection '%s.%s' => '%s.%s'" % (src_dbname, src_collname, idxname, typename))
                cursor = self._src.client()[src_dbname][src_collname].find(filter=None,
                                                                           cursor_type=pymongo.cursor.CursorType.EXHAUST,
                                                                           no_cursor_timeout=True,
                                                                           modifiers={'$snapshot': True})
                count = cursor.count()
                if count == 0:
                    log.info('    skip empty collection')
                    return

                n = 0
                actions = []
                actions_max = 20
                groups = []
                groups_max = 10

                for doc in cursor:
                    id = str(doc['_id'])
                    del doc['_id']
                    source = gen_doc_with_fields(doc, fields) if fields else doc
                    if source:
                        actions.append({'_op_type': 'index', '_index': idxname, '_type': typename, '_id': id, '_source': source})
                    if len(actions) == actions_max:
                        groups.append(actions)
                        actions = []
                    if len(groups) == groups_max:
                        threads = [gevent.spawn(self._dst.bulk_write, groups[i]) for i in xrange(groups_max)]
                        gevent.joinall(threads)
                        groups = []

                    n += 1
                    if n % 1000 == 0:
                        log.info('    %s.%s %d/%d (%.2f%%)' % (src_dbname, src_collname, n, count, float(n)/count*100))

                if len(groups) > 0:
                    threads = [gevent.spawn(self._dst.bulk_write, groups[i]) for i in xrange(len(groups))]
                    gevent.joinall(threads)
                if len(actions) > 0:
                    elasticsearch.helpers.bulk(client=self._dst.client(), actions=actions)

                log.info('    %s.%s %d/%d (%.2f%%)' % (src_dbname, src_collname, n, count, float(n)/count*100))
                return
            except pymongo.errors.AutoReconnect:
                self._src.reconnect()

    def _replay_oplog(self, oplog_start):
        """ Replay oplog.
        """
        self._last_bulk_optime = oplog_start

        n_total = 0
        n_skip = 0

        while True:
            # try to get cursor until success
            try:
                host, port = self._src.client().address
                log.info('try to sync oplog from %s on %s:%d' % (self._last_bulk_optime, host, port))
                # set codec options to guarantee the order of keys in command
                coll = self._src.client()['local'].get_collection('oplog.rs',
                                                                  codec_options=bson.codec_options.CodecOptions(document_class=bson.son.SON))
                cursor = coll.find({'ts': {'$gte': oplog_start}},
                                   cursor_type=pymongo.cursor.CursorType.TAILABLE_AWAIT,
                                   no_cursor_timeout=True)

                # New in version 3.2
                # src_version = mongo_utils.get_version(self._src.client())
                # if mongo_utils.version_higher_or_equal(src_version, '3.2.0'):
                #     cursor.max_await_time_ms(1000)

                valid_start_optime = False  # need to validate

                while True:
                    try:
                        if not cursor.alive:
                            log.error('cursor is dead')
                            raise pymongo.errors.AutoReconnect

                        oplog = cursor.next()
                        n_total += 1

                        if not valid_start_optime:
                            if oplog['ts'] == oplog_start:
                                log.info('oplog is ok: %s' % oplog_start)
                                valid_start_optime = True
                            else:
                                log.error('oplog %s is stale, terminate' % oplog_start)
                                return

                        # validate oplog
                        if not self._conf.data_filter.valid_oplog(oplog):
                            n_skip += 1
                            self._last_optime = oplog['ts']
                            continue

                        op = oplog['op']
                        ns = oplog['ns']

                        if op == 'i':  # insert
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                            fields = self._conf.fieldmap.get(gen_namespace(dbname, collname))

                            doc = oplog['o']
                            id = str(doc['_id'])
                            del doc['_id']
                            if fields:
                                doc = gen_doc_with_fields(doc, fields)
                            if doc:
                                self._action_buf.append({'_op_type': 'index', '_index': idxname, '_type': typename, '_id': id, '_source': doc})

                        elif op == 'u':  # update
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                            fields = self._conf.fieldmap.get(gen_namespace(dbname, collname))

                            id = str(oplog['o2']['_id'])

                            if '$set' in oplog['o']:
                                doc = {}
                                for k, v in oplog['o']['$set'].iteritems():
                                    if not fields or k in fields:
                                        sub_doc = doc_flat_to_nested(k.split('.'), v)
                                        merge_doc(doc, sub_doc)
                                if doc:
                                    self._action_buf.append({'_op_type': 'update',
                                                             '_index': idxname,
                                                             '_type': typename,
                                                             '_id': id,
                                                             '_retry_on_conflict': 3,
                                                             'doc': doc,
                                                             'doc_as_upsert': True})

                            if '$unset' in oplog['o']:
                                script_statements = []
                                for keypath in oplog['o']['$unset'].iterkeys():
                                    if not fields or keypath in fields:
                                        pos = keypath.rfind('.')
                                        if pos >= 0:
                                            script_statements.append('ctx._source.%s.remove("%s")' % (keypath[:pos], keypath[pos+1:]))
                                        else:
                                            script_statements.append('ctx._source.remove("%s")' % keypath)
                                if script_statements:
                                    doc = {'script': '; '.join(script_statements)}
                                    self._action_buf.append({'_op_type': 'update',
                                                             '_index': idxname,
                                                             '_type': typename,
                                                             '_id': id,
                                                             '_retry_on_conflict': 3,
                                                             'script': doc['script']})

                            if '$set' not in oplog['o'] and '$unset' not in oplog['o']:
                                log.warn('unexpected oplog: %s', oplog['o'])

                        elif op == 'd':  # delete
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                            id = str(oplog['o']['_id'])
                            self._action_buf.append({'_op_type': 'delete', '_index': idxname, '_type': typename, '_id': id})

                        elif op == 'c':  # command
                            dbname, _ = parse_namespace(ns)
                            idxname = self._conf.db_mapping(dbname)
                            if 'drop' in oplog['o']:
                                # TODO: deleting the corresponding document type is not implemented yet
                                log.warn('you should implement document type deletion.')
                            if 'dropDatabase' in oplog['o']:
                                # delete index
                                self._dst.client().indices.delete(index=idxname)

                        elif op == 'n':  # no-op
                            pass
                        else:
                            log.error('invalid optype: %s' % oplog)

                        # flush
                        if self._action_buf_full():
                            self._dst.bulk_write(self._action_buf)
                            self._action_buf = []
                            self._last_bulk_optime = oplog['ts']

                        self._last_optime = oplog['ts']
                        self._log_optime(oplog['ts'])
                        self._log_progress()
                    except StopIteration as e:
                        # flush
                        if len(self._action_buf) > 0:
                            self._dst.bulk_write(self._action_buf)
                            self._action_buf = []
                            self._last_bulk_optime = self._last_optime
                        self._log_optime(self._last_optime)
                        self._log_progress('latest')
                        time.sleep(0.1)
                    except pymongo.errors.AutoReconnect as e:
                        log.error(e)
                        self._src.reconnect()
                        break
                    except elasticsearch.helpers.BulkIndexError as e:
                        log.error(e)
                        self._action_buf = []
            except IndexError as e:
                log.error(e)
                log.error('%s not found, terminate' % oplog_start)
                return
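
doc_flat_to_nested and merge_doc are used by the '$set' branch above but are not shown in these examples. A minimal sketch of the behaviour that branch relies on (inferred from the call sites, not the project's actual helpers):

# Assumed behaviour of the helpers called in the '$set' branch; these are
# illustrative sketches, not the project's implementations.
def doc_flat_to_nested(keys, value):
    """Turn ['a', 'b', 'c'] and v into {'a': {'b': {'c': v}}}."""
    doc = value
    for key in reversed(keys):
        doc = {key: doc}
    return doc

def merge_doc(dst, src):
    """Recursively merge src into dst in place."""
    for key, value in src.items():
        if isinstance(value, dict) and isinstance(dst.get(key), dict):
            merge_doc(dst[key], value)
        else:
            dst[key] = value

# A $set of {'a.b': 1, 'a.c': 2} becomes the partial document
# {'a': {'b': 1, 'c': 2}} that is sent to Elasticsearch as an upsert.
doc = {}
for k, v in {'a.b': 1, 'a.c': 2}.items():
    merge_doc(doc, doc_flat_to_nested(k.split('.'), v))
print(doc)  # {'a': {'b': 1, 'c': 2}}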