class MongoSyncer(CommonSyncer):
    """ MongoDB synchronizer.
    """

    def __init__(self, conf):
        CommonSyncer.__init__(self, conf)

        if not isinstance(self._conf.src_conf, MongoConfig):
            raise RuntimeError('invalid src config type')
        self._src = MongoHandler(self._conf.src_conf)
        if not self._src.connect():
            raise RuntimeError('connect to mongodb(src) failed: %s'
                               % self._conf.src_hostportstr)

        if not isinstance(self._conf.dst_conf, MongoConfig):
            raise RuntimeError('invalid dst config type')
        self._dst = MongoHandler(self._conf.dst_conf)
        if not self._dst.connect():
            raise RuntimeError('connect to mongodb(dst) failed: %s'
                               % self._conf.dst_hostportstr)

        self._multi_oplog_replayer = MultiOplogReplayer(self._dst, 10)

    def _create_index(self, namespace_tuple):
        """ Create indexes.
        """
        def format(key_direction_list):
            """ Format key and direction of index.
            """
            res = []
            for key, direction in key_direction_list:
                if isinstance(direction, float) or isinstance(direction, long):
                    direction = int(direction)
                res.append((key, direction))
            return res

        dbname, collname = namespace_tuple
        dst_dbname, dst_collname = self._conf.db_coll_mapping(dbname, collname)
        index_info = self._src.client()[dbname][collname].index_information()
        for name, info in index_info.iteritems():
            keys = info['key']
            options = {}
            options['name'] = name
            if 'unique' in info:
                options['unique'] = info['unique']
            if 'sparse' in info:
                options['sparse'] = info['sparse']
            if 'expireAfterSeconds' in info:
                options['expireAfterSeconds'] = info['expireAfterSeconds']
            if 'partialFilterExpression' in info:
                options['partialFilterExpression'] = info['partialFilterExpression']
            if 'dropDups' in info:
                options['dropDups'] = info['dropDups']
            # indexes are created before documents are imported,
            # so the 'background' option is not needed
            # if 'background' in info:
            #     options['background'] = info['background']
            # for text indexes
            if 'weights' in info:
                options['weights'] = info['weights']
            if 'default_language' in info:
                options['default_language'] = info['default_language']
            if 'language_override' in info:
                options['language_override'] = info['language_override']
            self._dst.create_index(dst_dbname, dst_collname, format(keys), **options)

    def _sync_collection(self, namespace_tuple):
        """ Sync a collection until success.
        """
        # create indexes first
        self._create_index(namespace_tuple)

        src_dbname, src_collname = namespace_tuple
        dst_dbname, dst_collname = self._conf.db_coll_mapping(src_dbname, src_collname)
        src_ns = '%s.%s' % (src_dbname, src_collname)

        total = self._src.client()[src_dbname][src_collname].count()
        self._progress_logger.register(src_ns, total)

        while True:
            try:
                cursor = self._src.client()[src_dbname][src_collname].find(
                    filter=None,
                    cursor_type=pymongo.cursor.CursorType.EXHAUST,
                    no_cursor_timeout=True,
                    modifiers={'$snapshot': True})

                reqs = []
                reqs_max = 100
                groups = []
                groups_max = 10
                n = 0

                for doc in cursor:
                    reqs.append(pymongo.ReplaceOne({'_id': doc['_id']}, doc, upsert=True))
                    if len(reqs) == reqs_max:
                        groups.append(reqs)
                        reqs = []
                    if len(groups) == groups_max:
                        threads = [gevent.spawn(self._dst.bulk_write,
                                                dst_dbname,
                                                dst_collname,
                                                groups[i],
                                                ordered=False,
                                                ignore_duplicate_key_error=True)
                                   for i in xrange(groups_max)]
                        gevent.joinall(threads, raise_error=True)
                        groups = []
                    n += 1
                    if n % 10000 == 0:
                        self._progress_logger.add(src_ns, n)
                        n = 0

                if len(groups) > 0:
                    threads = [gevent.spawn(self._dst.bulk_write,
                                            dst_dbname,
                                            dst_collname,
                                            groups[i],
                                            ordered=False,
                                            ignore_duplicate_key_error=True)
                               for i in xrange(len(groups))]
                    gevent.joinall(threads, raise_error=True)
                if len(reqs) > 0:
                    self._dst.bulk_write(dst_dbname,
                                         dst_collname,
                                         reqs,
                                         ordered=False,
                                         ignore_duplicate_key_error=True)

                self._progress_logger.add(src_ns, n, done=True)
                return
            except pymongo.errors.AutoReconnect:
                self._src.reconnect()

    def _sync_large_collection(self, namespace_tuple, split_points):
        """ Sync a large collection with multiple processes.
        """
        # create indexes first
        self._create_index(namespace_tuple)

        dbname, collname = namespace_tuple
        ns = '.'.join(namespace_tuple)

        log.info('pending to sync %s with %d processes' % (ns, len(split_points) + 1))

        coll = self._src.client()[dbname][collname]
        total = coll.count()
        self._progress_logger.register(ns, total)

        prog_q = multiprocessing.Queue()
        res_q = multiprocessing.Queue()
        proc_logging = multiprocessing.Process(target=logging_progress,
                                               args=(ns, total, prog_q))
        proc_logging.start()

        # build non-overlapping '_id' range queries from the split points
        queries = []
        lower_bound = None
        for point in split_points:
            if lower_bound is None:
                queries.append({'_id': {'$lt': point}})
            else:
                queries.append({'_id': {'$gte': lower_bound, '$lt': point}})
            lower_bound = point
        queries.append({'_id': {'$gte': lower_bound}})

        procs = []
        for query in queries:
            p = multiprocessing.Process(target=self._sync_collection_with_query,
                                        args=(namespace_tuple, query, prog_q, res_q))
            p.start()
            procs.append(p)
            log.info('start process %s with query %s' % (p.name, query))

        for p in procs:
            p.join()

        n_docs = 0
        for p in procs:
            n_docs += res_q.get()
        self._progress_logger.add(ns, n_docs, done=True)

        prog_q.put(True)
        prog_q.close()
        prog_q.join_thread()
        proc_logging.join()

    def _sync_collection_with_query(self, namespace_tuple, query, prog_q, res_q):
        """ Sync a collection with the given query.
        """
        self._src.reconnect()
        self._dst.reconnect()

        src_dbname, src_collname = namespace_tuple
        dst_dbname, dst_collname = self._conf.db_coll_mapping(src_dbname, src_collname)

        while True:
            try:
                cursor = self._src.client()[src_dbname][src_collname].find(
                    filter=query,
                    cursor_type=pymongo.cursor.CursorType.EXHAUST,
                    no_cursor_timeout=True,
                    # $snapshot causes blocking, possibly a bug
                    # modifiers={'$snapshot': True}
                )

                total = 0
                n = 0
                reqs = []
                reqs_max = 100
                groups = []
                groups_max = 10

                for doc in cursor:
                    reqs.append(pymongo.ReplaceOne({'_id': doc['_id']}, doc, upsert=True))
                    if len(reqs) == reqs_max:
                        groups.append(reqs)
                        reqs = []
                    if len(groups) == groups_max:
                        threads = [gevent.spawn(self._dst.bulk_write,
                                                dst_dbname,
                                                dst_collname,
                                                groups[i],
                                                ordered=False,
                                                ignore_duplicate_key_error=True)
                                   for i in xrange(groups_max)]
                        gevent.joinall(threads, raise_error=True)
                        groups = []
                    n += 1
                    total += 1
                    if n % 10000 == 0:
                        prog_q.put(n)
                        n = 0

                if len(groups) > 0:
                    threads = [gevent.spawn(self._dst.bulk_write,
                                            dst_dbname,
                                            dst_collname,
                                            groups[i],
                                            ordered=False,
                                            ignore_duplicate_key_error=True)
                               for i in xrange(len(groups))]
                    gevent.joinall(threads, raise_error=True)
                if len(reqs) > 0:
                    self._dst.bulk_write(dst_dbname,
                                         dst_collname,
                                         reqs,
                                         ordered=False,
                                         ignore_duplicate_key_error=True)
                if n > 0:
                    prog_q.put(n)

                res_q.put(total)

                prog_q.close()
                prog_q.join_thread()
                res_q.close()
                res_q.join_thread()
                return
            except pymongo.errors.AutoReconnect:
                self._src.reconnect()

    def _replay_oplog(self, start_optime):
        """ Replay oplog.
        """
        self._last_optime = start_optime
        n_total = 0
        n_skip = 0

        while True:
            # try to get an oplog cursor until success
            try:
                start_optime_valid = False
                need_log = False
                host, port = self._src.client().address
                log.info('try to sync oplog from %s on %s:%d'
                         % (self._last_optime, host, port))
                cursor = self._src.tail_oplog(start_optime)
            except IndexError as e:
                log.error(e)
                log.error('%s not found, terminate' % self._last_optime)
                return
            except Exception as e:
                log.error('get oplog cursor failed: %s' % e)
                continue

            # loop: read and apply oplog
            while True:
                try:
                    if need_log:
                        self._log_optime(self._last_optime)
                        self._log_progress()
                        need_log = False

                    if not cursor.alive:
                        log.error('cursor is dead')
                        raise pymongo.errors.AutoReconnect

                    oplog = cursor.next()
                    n_total += 1

                    # check start optime once
                    if not start_optime_valid:
                        if oplog['ts'] == self._last_optime:
                            log.info('oplog is ok: %s' % self._last_optime)
                            start_optime_valid = True
                        else:
                            log.error('oplog %s is stale, terminate' % self._last_optime)
                            return

                    if oplog['op'] == 'n':  # no-op
                        self._last_optime = oplog['ts']
                        need_log = True
                        continue

                    # validate oplog
                    if not self._conf.data_filter.valid_oplog(oplog):
                        n_skip += 1
                        self._last_optime = oplog['ts']
                        need_log = True
                        continue

                    dbname, collname = mongo_utils.parse_namespace(oplog['ns'])
                    dst_dbname, dst_collname = self._conf.db_coll_mapping(dbname, collname)
                    if dst_dbname != dbname or dst_collname != collname:
                        oplog['ns'] = '%s.%s' % (dst_dbname, dst_collname)

                    if self._stage == Stage.post_initial_sync:
                        if self._multi_oplog_replayer:
                            if mongo_utils.is_command(oplog):
                                self._multi_oplog_replayer.apply(ignore_duplicate_key_error=True)
                                self._multi_oplog_replayer.clear()
                                self._dst.apply_oplog(oplog)
                                self._last_optime = oplog['ts']
                                need_log = True
                            else:
                                self._multi_oplog_replayer.push(oplog)
                                if (oplog['ts'] == self._initial_sync_end_optime
                                        or self._multi_oplog_replayer.count() == self._oplog_batchsize):
                                    self._multi_oplog_replayer.apply(ignore_duplicate_key_error=True)
                                    self._multi_oplog_replayer.clear()
                                    self._last_optime = oplog['ts']
                                    need_log = True
                        else:
                            self._dst.apply_oplog(oplog, ignore_duplicate_key_error=True)
                            self._last_optime = oplog['ts']
                            need_log = True
                        if oplog['ts'] == self._initial_sync_end_optime:
                            log.info('step into stage: oplog_sync')
                            self._stage = Stage.oplog_sync
                    else:
                        if self._multi_oplog_replayer:
                            if mongo_utils.is_command(oplog):
                                self._multi_oplog_replayer.apply()
                                self._multi_oplog_replayer.clear()
                                self._dst.apply_oplog(oplog)
                                self._last_optime = oplog['ts']
                                need_log = True
                            else:
                                self._multi_oplog_replayer.push(oplog)
                                if self._multi_oplog_replayer.count() == self._oplog_batchsize:
                                    self._multi_oplog_replayer.apply()
                                    self._multi_oplog_replayer.clear()
                                    self._last_optime = oplog['ts']
                                    need_log = True
                        else:
                            self._dst.apply_oplog(oplog)
                            self._last_optime = oplog['ts']
                            need_log = True
                except StopIteration as e:
                    if self._multi_oplog_replayer and self._multi_oplog_replayer.count() > 0:
                        self._multi_oplog_replayer.apply()
                        self._multi_oplog_replayer.clear()
                        self._last_optime = self._multi_oplog_replayer.last_optime()
                        need_log = True
                    # no more oplogs, wait a moment
                    time.sleep(0.1)
                    self._log_optime(self._last_optime)
                    self._log_progress('latest')
                except pymongo.errors.DuplicateKeyError as e:
                    if self._stage == Stage.oplog_sync:
                        log.error(e)
                        log.error('terminate')
                        return
                    else:
                        log.error('ignore duplicate key error: %s' % e)
                        continue
                except pymongo.errors.AutoReconnect as e:
                    log.error(e)
                    self._src.reconnect()
                    break
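
# Illustrative sketch (not used by the classes above): how MongoSyncer._sync_large_collection
# turns a sorted list of '_id' split points into non-overlapping range queries, one per
# worker process. The function name is hypothetical and exists only for demonstration;
# it assumes `split_points` is sorted in ascending order and non-empty.
def _example_build_range_queries(split_points):
    """ Return one query per partition: (-inf, p0), [p0, p1), ..., [pN, +inf). """
    queries = []
    lower_bound = None
    for point in split_points:
        if lower_bound is None:
            queries.append({'_id': {'$lt': point}})
        else:
            queries.append({'_id': {'$gte': lower_bound, '$lt': point}})
        lower_bound = point
    queries.append({'_id': {'$gte': lower_bound}})
    return queries

# e.g. _example_build_range_queries([100, 200]) returns
#   [{'_id': {'$lt': 100}},
#    {'_id': {'$gte': 100, '$lt': 200}},
#    {'_id': {'$gte': 200}}]
# so len(split_points) + 1 processes each scan a disjoint '_id' range.
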
class EsSyncer(CommonSyncer):
    """ Elasticsearch synchronizer.
    """

    def __init__(self, conf):
        CommonSyncer.__init__(self, conf)

        if not isinstance(self._conf.src_conf, MongoConfig):
            raise Exception('invalid src config type')
        self._src = MongoHandler(self._conf.src_conf)
        if not self._src.connect():
            raise Exception('connect to mongodb(src) failed: %s'
                            % self._conf.src_hostportstr)

        if not isinstance(self._conf.dst_conf, EsConfig):
            raise Exception('invalid dst config type')
        self._dst = EsHandler(self._conf.dst_conf)
        if not self._dst.connect():
            raise Exception('connect to elasticsearch(dst) failed: %s'
                            % self._conf.dst_hostportstr)

        self._action_buf = []  # used to bulk write oplogs
        self._last_bulk_optime = None

    def _action_buf_full(self):
        return len(self._action_buf) >= 40

    def _sync_database(self, dbname):
        """ Sync a database.
        """
        log.info("sync database '%s'" % dbname)
        # create index
        idxname = self._conf.db_mapping(dbname)
        if self._dst.client().indices.exists(index=idxname):
            log.info('index already exists: %s' % idxname)
        else:
            log.info('create index: %s' % idxname)
            self._dst.client().indices.create(index=idxname)
        self._sync_collections(dbname)

    def _sync_collection(self, namespace_tuple):
        """ Sync a collection until success.
        """
        src_dbname, src_collname = namespace_tuple[0], namespace_tuple[1]
        idxname, typename = self._conf.db_coll_mapping(src_dbname, src_collname)
        fields = self._conf.fieldmap.get(gen_namespace(src_dbname, src_collname))

        while True:
            try:
                log.info("sync collection '%s.%s' => '%s.%s'"
                         % (src_dbname, src_collname, idxname, typename))
                cursor = self._src.client()[src_dbname][src_collname].find(
                    filter=None,
                    cursor_type=pymongo.cursor.CursorType.EXHAUST,
                    no_cursor_timeout=True,
                    modifiers={'$snapshot': True})
                count = cursor.count()
                if count == 0:
                    log.info(' skip empty collection')
                    return

                n = 0
                actions = []
                actions_max = 20
                groups = []
                groups_max = 10

                for doc in cursor:
                    id = str(doc['_id'])
                    del doc['_id']
                    source = gen_doc_with_fields(doc, fields) if fields else doc
                    if source:
                        actions.append({'_op_type': 'index',
                                        '_index': idxname,
                                        '_type': typename,
                                        '_id': id,
                                        '_source': source})
                    if len(actions) == actions_max:
                        groups.append(actions)
                        actions = []
                    if len(groups) == groups_max:
                        threads = [gevent.spawn(self._dst.bulk_write, groups[i])
                                   for i in xrange(groups_max)]
                        gevent.joinall(threads)
                        groups = []
                    n += 1
                    if n % 1000 == 0:
                        log.info(' %s.%s %d/%d (%.2f%%)'
                                 % (src_dbname, src_collname, n, count,
                                    float(n) / count * 100))

                if len(groups) > 0:
                    threads = [gevent.spawn(self._dst.bulk_write, groups[i])
                               for i in xrange(len(groups))]
                    gevent.joinall(threads)
                if len(actions) > 0:
                    elasticsearch.helpers.bulk(client=self._dst.client(),
                                               actions=actions)

                log.info(' %s.%s %d/%d (%.2f%%)'
                         % (src_dbname, src_collname, n, count,
                            float(n) / count * 100))
                return
            except pymongo.errors.AutoReconnect:
                self._src.reconnect()

    def _replay_oplog(self, oplog_start):
        """ Replay oplog.
        """
        self._last_bulk_optime = oplog_start

        n_total = 0
        n_skip = 0

        while True:
            # try to get a cursor until success
            try:
                host, port = self._src.client().address
                log.info('try to sync oplog from %s on %s:%d'
                         % (self._last_bulk_optime, host, port))
                # set codec options to guarantee the order of keys in commands
                coll = self._src.client()['local'].get_collection(
                    'oplog.rs',
                    codec_options=bson.codec_options.CodecOptions(
                        document_class=bson.son.SON))
                cursor = coll.find({'ts': {'$gte': oplog_start}},
                                   cursor_type=pymongo.cursor.CursorType.TAILABLE_AWAIT,
                                   no_cursor_timeout=True)

                # New in version 3.2
                # src_version = mongo_utils.get_version(self._src.client())
                # if mongo_utils.version_higher_or_equal(src_version, '3.2.0'):
                #     cursor.max_await_time_ms(1000)

                valid_start_optime = False  # need to validate

                while True:
                    try:
                        if not cursor.alive:
                            log.error('cursor is dead')
                            raise pymongo.errors.AutoReconnect

                        oplog = cursor.next()
                        n_total += 1

                        if not valid_start_optime:
                            if oplog['ts'] == oplog_start:
                                log.info('oplog is ok: %s' % oplog_start)
                                valid_start_optime = True
                            else:
                                log.error('oplog %s is stale, terminate' % oplog_start)
                                return

                        # validate oplog
                        if not self._conf.data_filter.valid_oplog(oplog):
                            n_skip += 1
                            self._last_optime = oplog['ts']
                            continue

                        op = oplog['op']
                        ns = oplog['ns']

                        if op == 'i':  # insert
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                            fields = self._conf.fieldmap.get(gen_namespace(dbname, collname))
                            doc = oplog['o']
                            id = str(doc['_id'])
                            del doc['_id']
                            if fields:
                                doc = gen_doc_with_fields(doc, fields)
                            if doc:
                                self._action_buf.append({'_op_type': 'index',
                                                         '_index': idxname,
                                                         '_type': typename,
                                                         '_id': id,
                                                         '_source': doc})
                        elif op == 'u':  # update
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                            fields = self._conf.fieldmap.get(gen_namespace(dbname, collname))
                            id = str(oplog['o2']['_id'])
                            if '$set' in oplog['o']:
                                doc = {}
                                for k, v in oplog['o']['$set'].iteritems():
                                    if not fields or k in fields:
                                        sub_doc = doc_flat_to_nested(k.split('.'), v)
                                        merge_doc(doc, sub_doc)
                                if doc:
                                    self._action_buf.append({'_op_type': 'update',
                                                             '_index': idxname,
                                                             '_type': typename,
                                                             '_id': id,
                                                             '_retry_on_conflict': 3,
                                                             'doc': doc,
                                                             'doc_as_upsert': True})
                            if '$unset' in oplog['o']:
                                script_statements = []
                                for keypath in oplog['o']['$unset'].iterkeys():
                                    if not fields or keypath in fields:
                                        pos = keypath.rfind('.')
                                        if pos >= 0:
                                            script_statements.append(
                                                'ctx._source.%s.remove("%s")'
                                                % (keypath[:pos], keypath[pos + 1:]))
                                        else:
                                            script_statements.append(
                                                'ctx._source.remove("%s")' % keypath)
                                if script_statements:
                                    doc = {'script': '; '.join(script_statements)}
                                    self._action_buf.append({'_op_type': 'update',
                                                             '_index': idxname,
                                                             '_type': typename,
                                                             '_id': id,
                                                             '_retry_on_conflict': 3,
                                                             'script': doc['script']})
                            if '$set' not in oplog['o'] and '$unset' not in oplog['o']:
                                log.warn('unexpected oplog: %s', oplog['o'])
                        elif op == 'd':  # delete
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                            id = str(oplog['o']['_id'])
                            self._action_buf.append({'_op_type': 'delete',
                                                     '_index': idxname,
                                                     '_type': typename,
                                                     '_id': id})
                        elif op == 'c':  # command
                            dbname, _ = parse_namespace(ns)
                            idxname = self._conf.db_mapping(dbname)
                            if 'drop' in oplog['o']:
                                # TODO: how to delete a document type?
                                log.warn('you should implement document type deletion.')
                            if 'dropDatabase' in oplog['o']:
                                # delete index
                                self._dst.client().indices.delete(index=idxname)
                        elif op == 'n':  # no-op
                            pass
                        else:
                            log.error('invalid optype: %s' % oplog)

                        # flush
                        if self._action_buf_full():
                            self._dst.bulk_write(self._action_buf)
                            self._action_buf = []
                            self._last_bulk_optime = oplog['ts']

                        self._last_optime = oplog['ts']
                        self._log_optime(oplog['ts'])
                        self._log_progress()
                    except StopIteration as e:
                        # flush
                        if len(self._action_buf) > 0:
                            self._dst.bulk_write(self._action_buf)
                            self._action_buf = []
                            self._last_bulk_optime = self._last_optime
                        self._log_optime(self._last_optime)
                        self._log_progress('latest')
                        time.sleep(0.1)
                    except pymongo.errors.AutoReconnect as e:
                        log.error(e)
                        self._src.reconnect()
                        break
                    except elasticsearch.helpers.BulkIndexError as e:
                        log.error(e)
                        self._action_buf = []
            except IndexError as e:
                log.error(e)
                log.error('%s not found, terminate' % oplog_start)
                return
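
# Illustrative sketch (not part of the syncer): roughly what the doc_flat_to_nested and
# merge_doc helpers used in EsSyncer._replay_oplog are expected to do when a '$set' oplog
# entry carries dotted keypaths. These stand-in implementations are assumptions for
# demonstration only; the real helpers are defined elsewhere in the project.
def _example_flat_to_nested(keys, value):
    """ ['a', 'b', 'c'], 1  ->  {'a': {'b': {'c': 1}}} """
    doc = value
    for key in reversed(keys):
        doc = {key: doc}
    return doc


def _example_merge(dst, src):
    """ Recursively merge `src` into `dst` in place. """
    for k, v in src.items():
        if isinstance(v, dict) and isinstance(dst.get(k), dict):
            _example_merge(dst[k], v)
        else:
            dst[k] = v
    return dst

# e.g. a '$set' of {'a.b': 1, 'a.c': 2} becomes the partial document {'a': {'b': 1, 'c': 2}},
# which is sent as the 'doc' of a bulk 'update' action with doc_as_upsert=True.
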