Example #1
    def _initial_sync(self):
        """ Initial sync.
        """
        def classify(ns_tuple, large_colls, small_colls):
            """ Find out large and small collections.
            """
            if self._is_large_collection(ns_tuple):
                points = self._split_coll(ns_tuple, self._n_workers)
                if points:
                    large_colls.append((ns_tuple, points))
                else:
                    small_colls.append(ns_tuple)
            else:
                small_colls.append(ns_tuple)

        large_colls = []
        small_colls = []

        pool = gevent.pool.Pool(8)
        colls = self._collect_colls()
        for ns in colls:
            dbname, collname = ns
            log.info('%d\t%s.%s' %
                     (self._src.client()[dbname][collname].count(), dbname,
                      collname))
            pool.spawn(classify, ns, large_colls, small_colls)
        pool.join()

        if len(large_colls) + len(small_colls) != len(colls):
            raise RuntimeError('failed to classify collections')

        log.info('large collections: %s' %
                 ['.'.join(ns) for ns, _ in large_colls])
        log.info('small collections: %s' %
                 ['.'.join(ns) for ns in small_colls])

        # create progress logger
        self._progress_logger = LoggerThread(len(colls))
        self._progress_logger.start()

        # small collections first
        pool = gevent.pool.Pool(8)
        for res in pool.imap(self._sync_collection, small_colls):
            if res is not None:
                # a non-None result means a collection failed to sync; abort
                sys.exit(1)

        # then large collections
        for ns, points in large_colls:
            self._sync_large_collection(ns, points)
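
A minimal standalone sketch of the gevent pool pattern used above: spawn/join to classify items concurrently, then imap to process them while preserving input order. The collection names and the "large" test below are invented for illustration and are not part of the project.

import gevent.pool

def classify(name, large, small):
    # hypothetical rule for the sketch: treat long names as "large"
    (large if len(name) > 10 else small).append(name)

large, small = [], []
pool = gevent.pool.Pool(8)  # at most 8 concurrent greenlets
for name in ['users', 'orders_archive_2020', 'sessions']:
    pool.spawn(classify, name, large, small)
pool.join()  # block until every spawned greenlet has finished

# imap applies the function concurrently and yields results in input order
pool = gevent.pool.Pool(8)
for length in pool.imap(len, small):
    print(length)
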
Example #2
class CommonSyncer(object):
    """ Common database synchronizer.

    A specific database synchronizer should implement the following methods:
        - __init__
        - _initial_sync
        - _sync_collection
        - _sync_large_collection
        - _replay_oplog
    """
    def __init__(self, conf):
        if not isinstance(conf, Config):
            raise RuntimeError('invalid config type')
        self._conf = conf

        self._ignore_dbs = ['admin', 'local']
        self._ignore_colls = [
            'system.indexes', 'system.profile', 'system.users'
        ]

        if conf.optime_logfilepath:
            self._optime_logger = OptimeLogger(conf.optime_logfilepath)
        else:
            self._optime_logger = None
        self._optime_log_interval = 10  # default 10s
        self._last_optime = None  # optime of the last applied oplog entry
        self._last_optime_logtime = time.time()

        self._log_interval = 2  # default 2s
        self._last_logtime = time.time()  # used in oplog replay

        # for large collections
        self._n_workers = 8  # multi-process
        self._large_coll_docs = 1000000  # 1,000,000 documents

        self._initial_sync_start_optime = None
        self._initial_sync_end_optime = None

        self._stage = Stage.STOPPED

    @property
    def from_to(self):
        return "%s => %s" % (self._conf.src_hostportstr,
                             self._conf.dst_hostportstr)

    @property
    def log_interval(self):
        return self._log_interval

    @log_interval.setter
    def log_interval(self, n_secs):
        if n_secs < 0:
            n_secs = 0
        self._log_interval = n_secs

    def run(self):
        """ Start to sync.
        """
        # never drop database automatically
        # clear data manually if necessary
        try:
            self._sync()
        except KeyboardInterrupt:
            log.info('keyboard interrupt')

    def _sync(self):
        """ Sync databases and oplog.
        """
        if self._conf.start_optime:
            log.info("locating oplog, it will take a while")
            doc = None
            cur = self._src.client()['local']['oplog.rs'].find(
                {
                    'ts': {
                        '$lte': self._conf.start_optime
                    }
                }, {
                    "ts": 1
                }).sort("$natural", -1).limit(1)
            try:
                doc = cur.next()
            except StopIteration:
                pass
            if not doc:
                log.error('oplog is stale')
                return
            start_optime = doc['ts']
            log.info('actual start timestamp is %s' % start_optime)
            self._stage = Stage.OPLOG_SYNC
            self._replay_oplog(start_optime)
        else:
            # initial sync
            self._initial_sync_start_optime = get_optime(self._src.client())
            self._stage = Stage.INITIAL_SYNC

            self._initial_sync()

            self._stage = Stage.POST_INITIAL_SYNC
            self._initial_sync_end_optime = get_optime(self._src.client())

            # oplog sync
            if self._optime_logger:
                self._optime_logger.write(self._initial_sync_start_optime)
            self._replay_oplog(self._initial_sync_start_optime)

    def _collect_colls(self):
        """ Collect collections to sync.
        """
        colls = []
        for dbname in self._src.client().database_names():
            if dbname in self._ignore_dbs:
                continue
            if not self._conf.data_filter.valid_db(dbname):
                continue
            for collname in self._src.client()[dbname].collection_names(
                    include_system_collections=False):
                if collname in self._ignore_colls:
                    continue
                if not self._conf.data_filter.valid_coll(dbname, collname):
                    continue
                colls.append((dbname, collname))
        return colls

    def _split_coll(self, namespace_tuple, n_partitions):
        """ Split a collection into n partitions.

        Return a list of split points.

        splitPointCount = partitionCount - 1
        splitPointCount = keyTotalCount / (keyCount + 1)
        keyCount = maxChunkSize / (2 * avgObjSize)
        =>
        maxChunkSize = (keyTotalCount / (partitionCount - 1) - 1) * 2 * avgObjSize

        Note: maxChunkObjects defaults to 250000.
        """
        if n_partitions <= 1:
            raise RuntimeError('n_partitions must be greater than 1, got %s' %
                               n_partitions)

        dbname, collname = namespace_tuple
        ns = '.'.join(namespace_tuple)
        db = self._src.client()[dbname]
        collstats = db.command('collstats', collname)

        if 'avgObjSize' not in collstats:  # empty collection
            return []

        n_points = n_partitions - 1
        max_chunk_size = int(
            ((collstats['count'] /
              (n_partitions - 1) - 1) * 2 * collstats['avgObjSize']) / 1024 /
            1024)

        if max_chunk_size <= 0:
            return []

        res = db.command('splitVector',
                         ns,
                         keyPattern={'_id': 1},
                         maxSplitPoints=n_points,
                         maxChunkSize=max_chunk_size,
                         maxChunkObjects=collstats['count'])

        if res['ok'] != 1:
            return []
        else:
            return [doc['_id'] for doc in res['splitKeys']]
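
    # Worked example of the maxChunkSize formula above (hypothetical numbers,
    # not taken from the project): with count = 8,000,000 documents,
    # avgObjSize = 512 bytes and n_partitions = 8:
    #   keyCount     = 8,000,000 / (8 - 1) - 1 ~= 1,142,856
    #   maxChunkSize = 1,142,856 * 2 * 512 bytes ~= 1116 MB
    # so splitVector is asked for 7 split points with roughly 1116 MB chunks,
    # which partitions the _id range into 8 roughly equal parts.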

    def _initial_sync(self):
        """ Initial sync.
        """
        def classify(ns_tuple, large_colls, small_colls):
            """ Find out large and small collections.
            """
            if self._is_large_collection(ns_tuple):
                points = self._split_coll(ns_tuple, self._n_workers)
                if points:
                    large_colls.append((ns_tuple, points))
                else:
                    small_colls.append(ns_tuple)
            else:
                small_colls.append(ns_tuple)

        large_colls = []
        small_colls = []

        pool = gevent.pool.Pool(8)
        colls = self._collect_colls()
        for ns in colls:
            dbname, collname = ns
            log.info('%d\t%s.%s' %
                     (self._src.client()[dbname][collname].count(), dbname,
                      collname))
            pool.spawn(classify, ns, large_colls, small_colls)
        pool.join()

        if len(large_colls) + len(small_colls) != len(colls):
            raise RuntimeError('failed to classify collections')

        log.info('large collections: %s' %
                 ['.'.join(ns) for ns, _ in large_colls])
        log.info('small collections: %s' %
                 ['.'.join(ns) for ns in small_colls])

        # create progress logger
        self._progress_logger = LoggerThread(len(colls))
        self._progress_logger.start()

        # small collections first
        pool = gevent.pool.Pool(8)
        for res in pool.imap(self._sync_collection, small_colls):
            if res is not None:
                # a non-None result means a collection failed to sync; abort
                sys.exit(1)

        # then large collections
        for ns, points in large_colls:
            self._sync_large_collection(ns, points)

    def _sync_collection(self, namespace_tuple):
        """ Sync a collection until success.
        """
        raise NotImplementedError(
            'you should implement %s.%s' %
            (self.__class__.__name__, self._sync_collection.__name__))

    def _is_large_collection(self, namespace_tuple):
        """ Check if large collection or not.
        """
        dbname, collname = namespace_tuple
        coll = self._src.client()[dbname][collname]
        return coll.count() > self._large_coll_docs

    def _sync_large_collection(self, namespace_tuple, split_points):
        """ Sync large collection until success.
        """
        raise NotImplementedError(
            'you should implement %s.%s' %
            (self.__class__.__name__, self._sync_large_collection.__name__))

    def _replay_oplog(self, oplog_start):
        """ Replay oplog.
        """
        raise NotImplementedError(
            'you should implement %s.%s' %
            (self.__class__.__name__, self._replay_oplog.__name__))

    def _log_progress(self, tag=''):
        """ Print progress periodically.
        """
        now = time.time()
        if now - self._last_logtime >= self._log_interval:
            delay = now - self._last_optime.time
            time_unit = 'second' if delay <= 1 else 'seconds'
            if tag:
                log.info(
                    '%s - sync to %s - %d %s delay - %s - %s' %
                    (self.from_to,
                     datetime.datetime.fromtimestamp(self._last_optime.time),
                     delay, time_unit, self._last_optime, tag))
            else:
                log.info(
                    '%s - sync to %s - %d %s delay - %s' %
                    (self.from_to,
                     datetime.datetime.fromtimestamp(self._last_optime.time),
                     delay, time_unit, self._last_optime))
            self._last_logtime = now

    def _log_optime(self, optime):
        """ Record optime periodically.
        """
        if not self._optime_logger:
            return
        now = time.time()
        if now - self._last_optime_logtime >= self._optime_log_interval:
            self._optime_logger.write(optime)
            self._last_optime_logtime = now
            log.info("flush optime into file '%s': %s" %
                     (self._optime_logger.filepath, optime))
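
As the CommonSyncer docstring notes, a concrete synchronizer supplies the hook methods; the base class already provides a generic _initial_sync, so only the methods that raise NotImplementedError need overriding. A minimal skeleton of such a subclass might look like the sketch below; MongoSyncer, its connections and the method bodies are hypothetical placeholders, not the project's actual implementation.

class MongoSyncer(CommonSyncer):
    """ Hypothetical concrete synchronizer (sketch only). """
    def __init__(self, conf):
        super(MongoSyncer, self).__init__(conf)
        # a real subclass would create the self._src / self._dst clients here

    def _sync_collection(self, namespace_tuple):
        # copy every document of one small collection; return None on success
        return None

    def _sync_large_collection(self, namespace_tuple, split_points):
        # copy one large collection in parallel, one worker per _id range
        pass

    def _replay_oplog(self, oplog_start):
        # tail local.oplog.rs starting from oplog_start and apply each entry
        pass

Given a valid Config, MongoSyncer(conf).run() would then drive the whole flow: initial sync of all collections first, followed by oplog replay.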