def parse(): """ Parse command options and generate config. """ conf = Config() parser = argparse.ArgumentParser(description='Sync data from a replica-set to another MongoDB/Elasticsearch.') parser.add_argument('-f', '--config', nargs='?', required=False, help='configuration file, note that command options will override items in config file') parser.add_argument('--src', nargs='?', required=False, help='source should be hostportstr of a replica-set member') parser.add_argument('--src-authdb', nargs='?', required=False, help="src authentication database, default is 'admin'") parser.add_argument('--src-username', nargs='?', required=False, help='src username') parser.add_argument('--src-password', nargs='?', required=False, help='src password') parser.add_argument('--dst', nargs='?', required=False, help='destination should be hostportstr of a mongos or mongod instance') parser.add_argument('--dst-authdb', nargs='?', required=False, help="dst authentication database, default is 'admin', for MongoDB") parser.add_argument('--dst-username', nargs='?', required=False, help='dst username, for MongoDB') parser.add_argument('--dst-password', nargs='?', required=False, help='dst password, for MongoDB') parser.add_argument('--start-optime', type=int, nargs='?', required=False, help='timestamp in second, indicates oplog based increment sync') parser.add_argument('--optime-logfile', nargs='?', required=False, help="optime log file path, use this as start optime if without '--start-optime'") parser.add_argument('--logfile', nargs='?', required=False, help='log file path') args = parser.parse_args() if args.config is not None: conf = ConfigFile.load(args.config) if args.src is not None: conf.src_conf.hosts = args.src if args.src_authdb is not None: conf.src_conf.authdb = args.src_authdb if args.src_username is not None: conf.src_conf.username = args.src_username if args.src_password is not None: conf.src_conf.password = args.src_password if args.dst is not None: conf.dst_conf.hosts = args.dst if args.dst_authdb is not None: conf.dst_conf.authdb = args.dst_authdb if args.dst_username is not None: conf.dst_conf.username = args.dst_username if args.dst_password is not None: conf.dst_conf.password = args.dst_password if args.start_optime is not None: conf.start_optime = Timestamp(args.start_optime, 0) if args.optime_logfile is not None: conf.optime_logfilepath = args.optime_logfile if args.start_optime is None: optime_logger = OptimeLogger(args.optime_logfile) conf.start_optime = optime_logger.read() if args.logfile is not None: conf.logfilepath = args.logfile return conf
class CommonSyncer(object):
    """ Common database synchronizer.

    Specific database synchronizers should implement the following methods:
        - __init__
        - _initial_sync
        - _sync_collection
        - _sync_large_collection
        - _replay_oplog
    """

    def __init__(self, conf):
        if not isinstance(conf, Config):
            raise RuntimeError('invalid config type')
        self._conf = conf
        self._ignore_dbs = ['admin', 'local']
        self._ignore_colls = [
            'system.indexes', 'system.profile', 'system.users'
        ]
        if conf.optime_logfilepath:
            self._optime_logger = OptimeLogger(conf.optime_logfilepath)
        else:
            self._optime_logger = None
        self._optime_log_interval = 10  # default 10s
        self._last_optime = None  # optime of the last applied oplog entry
        self._last_optime_logtime = time.time()
        self._log_interval = 2  # default 2s
        self._last_logtime = time.time()  # used in oplog replay

        # for large collections
        self._n_workers = 8  # multi-process
        self._large_coll_docs = 1000000  # 1 million documents

        self._initial_sync_start_optime = None
        self._initial_sync_end_optime = None

        self._stage = Stage.STOPPED

    @property
    def from_to(self):
        return "%s => %s" % (self._conf.src_hostportstr,
                             self._conf.dst_hostportstr)

    @property
    def log_interval(self):
        return self._log_interval

    @log_interval.setter
    def log_interval(self, n_secs):
        if n_secs < 0:
            n_secs = 0
        self._log_interval = n_secs

    def run(self):
        """ Start to sync.
        """
        # never drop databases automatically
        # clear data manually if necessary
        try:
            self._sync()
        except KeyboardInterrupt:
            log.info('keyboard interrupt')

    def _sync(self):
        """ Sync databases and oplog.
        """
        if self._conf.start_optime:
            log.info("locating oplog, this may take a while")
            doc = None
            cur = self._src.client()['local']['oplog.rs'].find(
                {'ts': {'$lte': self._conf.start_optime}},
                {'ts': 1}).sort('$natural', -1).limit(1)
            try:
                doc = next(cur)
            except StopIteration:
                pass
            if not doc:
                log.error('oplog is stale')
                return
            start_optime = doc['ts']
            log.info('actual start timestamp is %s' % start_optime)
            self._stage = Stage.OPLOG_SYNC
            self._replay_oplog(start_optime)
        else:
            # initial sync
            self._initial_sync_start_optime = get_optime(self._src.client())
            self._stage = Stage.INITIAL_SYNC
            self._initial_sync()
            self._stage = Stage.POST_INITIAL_SYNC
            self._initial_sync_end_optime = get_optime(self._src.client())
            # oplog sync
            if self._optime_logger:
                self._optime_logger.write(self._initial_sync_start_optime)
            self._replay_oplog(self._initial_sync_start_optime)

    def _collect_colls(self):
        """ Collect collections to sync.
        """
        colls = []
        for dbname in self._src.client().database_names():
            if dbname in self._ignore_dbs:
                continue
            if not self._conf.data_filter.valid_db(dbname):
                continue
            for collname in self._src.client()[dbname].collection_names(
                    include_system_collections=False):
                if collname in self._ignore_colls:
                    continue
                if not self._conf.data_filter.valid_coll(dbname, collname):
                    continue
                colls.append((dbname, collname))
        return colls

    def _split_coll(self, namespace_tuple, n_partitions):
        """ Split a collection into n partitions.
        Return a list of split points.

            splitPointCount = partitionCount - 1
            splitPointCount = keyTotalCount / (keyCount + 1)
            keyCount = maxChunkSize / (2 * avgObjSize)
            => maxChunkSize = (keyTotalCount / (partitionCount - 1) - 1) * 2 * avgObjSize

        Note: maxChunkObjects defaults to 250000.
""" if n_partitions <= 1: raise RuntimeError('n_partitions need greater than 1, but %s' % n_partitions) dbname, collname = namespace_tuple ns = '.'.join(namespace_tuple) db = self._src.client()[dbname] collstats = db.command('collstats', collname) if 'avgObjSize' not in collstats: # empty collection return [] n_points = n_partitions - 1 max_chunk_size = int( ((collstats['count'] / (n_partitions - 1) - 1) * 2 * collstats['avgObjSize']) / 1024 / 1024) if max_chunk_size <= 0: return [] res = db.command('splitVector', ns, keyPattern={'_id': 1}, maxSplitPoints=n_points, maxChunkSize=max_chunk_size, maxChunkObjects=collstats['count']) if res['ok'] != 1: return [] else: return [doc['_id'] for doc in res['splitKeys']] def _initial_sync(self): """ Initial sync. """ def classify(ns_tuple, large_colls, small_colls): """ Find out large and small collections. """ if self._is_large_collection(ns_tuple): points = self._split_coll(ns_tuple, self._n_workers) if points: large_colls.append((ns_tuple, points)) else: small_colls.append(ns_tuple) else: small_colls.append(ns_tuple) large_colls = [] small_colls = [] pool = gevent.pool.Pool(8) colls = self._collect_colls() for ns in colls: dbname, collname = ns log.info('%d\t%s.%s' % (self._src.client()[dbname][collname].count(), dbname, collname)) pool.spawn(classify, ns, large_colls, small_colls) pool.join() if len(large_colls) + len(small_colls) != len(colls): raise RuntimeError('classify collections error') log.info('large collections: %s' % ['.'.join(ns) for ns, points in large_colls]) log.info('small collections: %s' % ['.'.join(ns) for ns in small_colls]) # create progress logger self._progress_logger = LoggerThread(len(colls)) self._progress_logger.start() # small collections first pool = gevent.pool.Pool(8) for res in pool.imap(self._sync_collection, small_colls): if res is not None: sys.exit(1) # then large collections for ns, points in large_colls: self._sync_large_collection(ns, points) def _sync_collection(self, namespace_tuple): """ Sync a collection until success. """ raise NotImplementedError( 'you should implement %s.%s' % (self.__class__.__name__, self._sync_collection.__name__)) def _is_large_collection(self, namespace_tuple): """ Check if large collection or not. """ dbname, collname = namespace_tuple return True if self._src.client()[dbname][collname].count( ) > self._large_coll_docs else False def _sync_large_collection(self, namespace_tuple, split_points): """ Sync large collection until success. """ raise NotImplementedError( 'you should implement %s.%s' % (self.__class__.__name__, self._sync_large_collection.__name__)) def _replay_oplog(self, oplog_start): """ Replay oplog. """ raise NotImplementedError( 'you should implement %s.%s' % (self.__class__.__name__, self._replay_oplog.__name__)) def _log_progress(self, tag=''): """ Print progress periodically. """ now = time.time() if now - self._last_logtime >= self._log_interval: delay = now - self._last_optime.time time_unit = 'second' if delay <= 1 else 'seconds' if tag: log.info( '%s - sync to %s - %d %s delay - %s - %s' % (self.from_to, datetime.datetime.fromtimestamp(self._last_optime.time), delay, time_unit, self._last_optime, tag)) else: log.info( '%s - sync to %s - %d %s delay - %s' % (self.from_to, datetime.datetime.fromtimestamp(self._last_optime.time), delay, time_unit, self._last_optime)) self._last_logtime = now def _log_optime(self, optime): """ Record optime periodically. 
""" if not self._optime_logger: return now = time.time() if now - self._last_optime_logtime >= self._optime_log_interval: self._optime_logger.write(optime) self._last_optime_logtime = now log.info("flush optime into file '%s': %s" % (self._optime_logger.filepath, optime))
def load(filepath):
    """ Load config file and generate conf.
    """
    conf = Config()
    tml = toml.load(filepath)

    conf.src_conf = MongoConfig(tml['src']['hosts'],
                                tml['src'].get('authdb', 'admin'),
                                tml['src'].get('username', ''),
                                tml['src'].get('password', ''))

    if 'type' not in tml['dst'] or tml['dst']['type'] == 'mongo':
        conf.dst_conf = MongoConfig(tml['dst']['hosts'],
                                    tml['dst'].get('authdb', 'admin'),
                                    tml['dst'].get('username', ''),
                                    tml['dst'].get('password', ''))
    elif tml['dst']['type'] == 'es':
        conf.dst_conf = EsConfig(tml['dst']['hosts'])
    else:
        raise Exception('invalid dst.type')

    if 'sync' in tml and 'dbs' in tml['sync']:
        for dbentry in tml['sync']['dbs']:
            if 'db' not in dbentry:
                raise Exception("'db' is missing in sync.dbs")
            if not dbentry['db']:
                raise Exception("'db' is empty in sync.dbs")
            dbname = dbentry['db'].strip()
            rename_db = dbentry['rename_db'].strip() if 'rename_db' in dbentry else ""

            # update db map
            if dbname and rename_db:
                if dbname in conf.dbmap:
                    raise Exception('duplicate dbname in sync.dbs: %s' % dbname)
                conf.dbmap[dbname] = rename_db

            if 'colls' in dbentry and dbentry['colls']:
                for collentry in dbentry['colls']:
                    if isinstance(collentry, str):
                        collname = collentry.strip()
                        ns = gen_namespace(dbname, collname)
                        conf.data_filter.add_include_coll(ns)
                    elif isinstance(collentry, dict):
                        if 'coll' not in collentry:
                            raise Exception("'coll' is missing in sync.dbs.colls")
                        if not collentry['coll']:
                            raise Exception("'coll' is empty in sync.dbs.colls")
                        collname = collentry['coll'].strip()
                        fields = frozenset(
                            [f.strip() for f in collentry['fields']]
                            if 'fields' in collentry else [])

                        # update coll filter
                        ns = gen_namespace(dbname, collname)
                        conf.data_filter.add_include_coll(ns)

                        # update fields
                        if fields:
                            if ns in conf.fieldmap:
                                raise Exception(
                                    'duplicate collname in sync.dbs.colls: %s' % ns)
                            conf.fieldmap[ns] = fields
                    else:
                        raise Exception(
                            'invalid entry in sync.dbs.colls: %s' % collentry)
            else:
                # update coll filter
                conf.data_filter.add_include_coll(gen_namespace(dbname, '*'))

    if 'sync' in tml and 'start_optime' in tml['sync']:
        conf.start_optime = Timestamp(tml['sync']['start_optime'], 0)

    if 'log' in tml and 'filepath' in tml['log']:
        conf.logfilepath = tml['log']['filepath']

    if 'log' in tml and 'op_time_path' in tml['log']:
        conf.optime_logfilepath = tml['log']['op_time_path']
        optime_logger = OptimeLogger(conf.optime_logfilepath)
        optime = optime_logger.read()
        if optime:
            conf.start_optime = optime

    return conf
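Since the expected keys are easier to see in a file than in the parsing code, here is a hedged example of a config that load() should accept; the hosts, database names, collection names, and field names are made up for illustration only.

# A minimal sketch: write an illustrative config and load it.
SAMPLE_CONF = '''
[src]
hosts = "127.0.0.1:27017"
authdb = "admin"
username = ""
password = ""

[dst]
type = "mongo"
hosts = "127.0.0.1:27018"

[sync]
start_optime = 1558150000

[[sync.dbs]]
db = "mydb"
rename_db = "mydb_copy"
colls = [ "users", "orders" ]

[[sync.dbs]]
db = "logs"
colls = [ { coll = "events", fields = ["_id", "type", "ts"] } ]

[log]
filepath = "./sync.log"
# op_time_path = "./optime.log"  # optional; load() reads a start optime from it if present
'''

if __name__ == '__main__':
    with open('sample.toml', 'w') as f:
        f.write(SAMPLE_CONF)
    conf = load('sample.toml')  # parsed into a Config object as described above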
class Synchronizer(object):
    """ Common synchronizer.

    Concrete synchronizers should implement the following methods:
        - __init__
        - __del__
        - _sync_database
        - _sync_collection
        - _sync_oplog
    """

    def __init__(self, conf):
        if not isinstance(conf, Config):
            raise Exception('invalid config type')
        self._conf = conf
        self._ignore_dbs = ['admin', 'local']
        self._ignore_colls = [
            'system.indexes', 'system.profile', 'system.users'
        ]
        if conf.optime_logfilepath:
            self._optime_logger = OptimeLogger(conf.optime_logfilepath)
        else:
            self._optime_logger = None
        self._optime_log_interval = 10  # default 10s
        self._last_optime = None  # optime of the last replayed oplog entry
        self._last_optime_logtime = time.time()
        self._log_interval = 2  # default 2s
        self._last_logtime = time.time()  # used in oplog replay

    @property
    def from_to(self):
        return "%s => %s" % (self._conf.src_hostportstr,
                             self._conf.dst_hostportstr)

    @property
    def log_interval(self):
        return self._log_interval

    @log_interval.setter
    def log_interval(self, n_secs):
        if n_secs < 0:
            n_secs = 0
        self._log_interval = n_secs

    def run(self):
        """ Start to sync.
        """
        # never drop databases automatically
        # clear the databases manually if necessary
        try:
            self._sync()
        except KeyboardInterrupt:
            log.info('keyboard interrupt')

    def _sync(self):
        """ Sync databases and oplog.
        """
        if self._conf.start_optime:
            # TODO optimize
            log.info("locating oplog, this may take a while")
            oplog_start = self._conf.start_optime
            doc = self._src.client()['local']['oplog.rs'].find_one(
                {'ts': {'$gte': oplog_start}})
            if not doc:
                log.error('no oplog entries newer than the specified optime')
                return
            oplog_start = doc['ts']
            log.info('actual start timestamp is %s' % oplog_start)
            self._last_optime = oplog_start
            self._sync_oplog(oplog_start)
        else:
            oplog_start = get_optime(self._src.client())
            if not oplog_start:
                log.error('get oplog_start failed, terminate')
                sys.exit(1)
            self._last_optime = oplog_start
            self._sync_databases()
            if self._optime_logger:
                self._optime_logger.write(oplog_start)
                log.info('first optime: %s' % oplog_start)
            self._sync_oplog(oplog_start)

    def _sync_databases(self):
        """ Sync databases excluding 'admin' and 'local'.
        """
        host, port = self._src.client().address
        log.info('sync databases from %s:%d' % (host, port))
        for dbname in self._src.client().database_names():
            if dbname in self._ignore_dbs:
                log.info("skip database '%s'" % dbname)
                continue
            if not self._conf.data_filter.valid_db(dbname):
                log.info("skip database '%s'" % dbname)
                continue
            self._sync_database(dbname)
        log.info('all databases done')

    def _sync_database(self, dbname):
        """ Sync a database.
        """
        raise NotImplementedError(
            'you should implement %s.%s' %
            (self.__class__.__name__, self._sync_database.__name__))

    def _sync_collections(self, dbname):
        """ Sync collections in the database excluding system collections.
        """
        collnames = self._src.client()[dbname].collection_names(
            include_system_collections=False)
        for collname in collnames:
            if collname in self._ignore_colls:
                log.info("skip collection '%s'" % gen_namespace(dbname, collname))
                continue
            if not self._conf.data_filter.valid_coll(dbname, collname):
                log.info("skip collection '%s'" % gen_namespace(dbname, collname))
                continue
            self._sync_collection(dbname, collname)

    def _sync_collection(self, dbname, collname):
        """ Sync a collection until success.
        """
        raise NotImplementedError(
            'you should implement %s.%s' %
            (self.__class__.__name__, self._sync_collection.__name__))

    def _sync_oplog(self, oplog_start):
        """ Replay oplog.
""" raise Exception('you should implement %s.%s' % (self.__class__.__name__, self._sync_oplog.__name__)) def _log_progress(self, tag=''): """ Print progress. """ now = time.time() if now - self._last_logtime >= self._log_interval: delay = now - self._last_optime.time time_unit = 'second' if delay <= 1 else 'seconds' if tag: log.info( '%s - sync to %s - %d %s delay - %s - %s' % (self.from_to, datetime.datetime.fromtimestamp(self._last_optime.time), delay, time_unit, self._last_optime, tag)) else: log.info( '%s - sync to %s - %d %s delay - %s' % (self.from_to, datetime.datetime.fromtimestamp(self._last_optime.time), delay, time_unit, self._last_optime)) self._last_logtime = now def _log_optime(self, optime): """ Record optime. """ if not self._optime_logger: return now = time.time() if now - self._last_optime_logtime >= self._optime_log_interval: self._optime_logger.write(optime) self._last_optime_logtime = now log.info("flush optime into file '%s': %s" % (self._optime_logger.filepath, optime))