def _sync_collections(self, dbname):
    """ Sync collections in the database, excluding system collections.
    """
    collnames = self._src.client()[dbname].collection_names(
        include_system_collections=False)
    for collname in collnames:
        if collname in self._ignore_colls:
            log.info("skip collection '%s'" % gen_namespace(dbname, collname))
            continue
        if not self._conf.data_filter.valid_coll(dbname, collname):
            log.info("skip collection '%s'" % gen_namespace(dbname, collname))
            continue
        # _sync_collection expects a (dbname, collname) namespace tuple
        self._sync_collection((dbname, collname))
def valid_coll(self, dbname, collname):
    if not self._include_colls:
        return True
    if '%s.*' % dbname in self._include_colls:
        return True
    return gen_namespace(dbname, collname) in self._include_colls
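# gen_namespace() and parse_namespace() are used throughout but not defined
# in this section. Minimal sketches, assuming a namespace is the usual
# dotted 'db.collection' string; the real helpers may differ:

def gen_namespace(dbname, collname):
    """ Join a database name and a collection name into a namespace. """
    return '%s.%s' % (dbname, collname)

def parse_namespace(ns):
    """ Split a namespace on the first dot, since collection names
    may themselves contain dots. """
    dbname, _, collname = ns.partition('.')
    return dbname, collname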
def _sync_collection(self, namespace_tuple):
    """ Sync a collection until success.
    """
    src_dbname, src_collname = namespace_tuple
    idxname, typename = self._conf.db_coll_mapping(src_dbname, src_collname)
    fields = self._conf.fieldmap.get(gen_namespace(src_dbname, src_collname))
    while True:
        try:
            log.info("sync collection '%s.%s' => '%s.%s'"
                     % (src_dbname, src_collname, idxname, typename))
            cursor = self._src.client()[src_dbname][src_collname].find(
                filter=None,
                cursor_type=pymongo.cursor.CursorType.EXHAUST,
                no_cursor_timeout=True,
                modifiers={'$snapshot': True})
            count = cursor.count()
            if count == 0:
                log.info('    skip empty collection')
                return
            n = 0
            actions = []
            actions_max = 20
            groups = []
            groups_max = 10
            for doc in cursor:
                id = str(doc['_id'])
                del doc['_id']
                source = gen_doc_with_fields(doc, fields) if fields else doc
                if source:
                    actions.append({'_op_type': 'index',
                                    '_index': idxname,
                                    '_type': typename,
                                    '_id': id,
                                    '_source': source})
                if len(actions) == actions_max:
                    groups.append(actions)
                    actions = []
                if len(groups) == groups_max:
                    threads = [gevent.spawn(self._dst.bulk_write, groups[i])
                               for i in xrange(groups_max)]
                    gevent.joinall(threads, raise_error=True)
                    groups = []
                n += 1
                if n % 1000 == 0:
                    log.info('    %s.%s %d/%d (%.2f%%)'
                             % (src_dbname, src_collname, n, count,
                                float(n) / count * 100))
            if len(groups) > 0:
                threads = [gevent.spawn(self._dst.bulk_write, groups[i])
                           for i in xrange(len(groups))]
                gevent.joinall(threads, raise_error=True)
            if len(actions) > 0:
                elasticsearch.helpers.bulk(client=self._dst.client(),
                                           actions=actions)
            log.info('    %s.%s %d/%d (%.2f%%)'
                     % (src_dbname, src_collname, n, count,
                        float(n) / count * 100))
            return
        except pymongo.errors.AutoReconnect:
            self._src.reconnect()
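# gen_doc_with_fields() is not defined in this section. A minimal sketch,
# assuming it projects a document down to the configured top-level field
# whitelist; the real helper may also handle dotted key paths:

def gen_doc_with_fields(doc, fields):
    """ Return a copy of doc keeping only the whitelisted fields. """
    return dict((k, v) for k, v in doc.iteritems() if k in fields)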
def _sync_oplog(self, oplog_start):
    """ Replay oplog.
    """
    self._last_bulk_optime = oplog_start
    n_total = 0
    n_skip = 0
    while True:
        # try to get cursor until success
        try:
            host, port = self._src.client().address
            log.info('try to sync oplog from %s on %s:%d'
                     % (self._last_bulk_optime, host, port))
            # set codec options to guarantee the order of keys in command
            coll = self._src.client()['local'].get_collection(
                'oplog.rs',
                codec_options=bson.codec_options.CodecOptions(
                    document_class=bson.son.SON))
            cursor = coll.find(
                {'ts': {'$gte': oplog_start}},
                cursor_type=pymongo.cursor.CursorType.TAILABLE_AWAIT,
                no_cursor_timeout=True)

            # New in version 3.2
            # src_version = mongo_utils.get_version(self._src.client())
            # if mongo_utils.version_higher_or_equal(src_version, '3.2.0'):
            #     cursor.max_await_time_ms(1000)

            valid_start_optime = False  # need to validate

            while True:
                try:
                    if not cursor.alive:
                        log.error('cursor is dead')
                        raise pymongo.errors.AutoReconnect

                    oplog = cursor.next()
                    n_total += 1

                    if not valid_start_optime:
                        if oplog['ts'] == oplog_start:
                            log.info('oplog is ok: %s' % oplog_start)
                            valid_start_optime = True
                        else:
                            log.error('oplog %s is stale, terminate' % oplog_start)
                            return

                    # validate oplog
                    if not self._conf.data_filter.valid_oplog(oplog):
                        n_skip += 1
                        self._last_optime = oplog['ts']
                        continue

                    op = oplog['op']
                    ns = oplog['ns']

                    if op == 'i':  # insert
                        dbname, collname = parse_namespace(ns)
                        idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                        fields = self._conf.fieldmap.get(gen_namespace(dbname, collname))
                        doc = oplog['o']
                        id = str(doc['_id'])
                        del doc['_id']
                        if fields:
                            doc = gen_doc_with_fields(doc, fields)
                        if doc:
                            self._action_buf.append({'_op_type': 'index',
                                                     '_index': idxname,
                                                     '_type': typename,
                                                     '_id': id,
                                                     '_source': doc})
                    elif op == 'u':  # update
                        dbname, collname = parse_namespace(ns)
                        idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                        fields = self._conf.fieldmap.get(gen_namespace(dbname, collname))
                        id = str(oplog['o2']['_id'])
                        if '$set' in oplog['o']:
                            doc = {}
                            for k, v in oplog['o']['$set'].iteritems():
                                if not fields or k in fields:
                                    sub_doc = doc_flat_to_nested(k.split('.'), v)
                                    merge_doc(doc, sub_doc)
                            if doc:
                                self._action_buf.append({'_op_type': 'update',
                                                         '_index': idxname,
                                                         '_type': typename,
                                                         '_id': id,
                                                         '_retry_on_conflict': 3,
                                                         'doc': doc,
                                                         'doc_as_upsert': True})
                        if '$unset' in oplog['o']:
                            script_statements = []
                            for keypath in oplog['o']['$unset'].iterkeys():
                                if not fields or keypath in fields:
                                    pos = keypath.rfind('.')
                                    if pos >= 0:
                                        script_statements.append(
                                            'ctx._source.%s.remove("%s")'
                                            % (keypath[:pos], keypath[pos+1:]))
                                    else:
                                        script_statements.append(
                                            'ctx._source.remove("%s")' % keypath)
                            if script_statements:
                                doc = {'script': '; '.join(script_statements)}
                                self._action_buf.append({'_op_type': 'update',
                                                         '_index': idxname,
                                                         '_type': typename,
                                                         '_id': id,
                                                         '_retry_on_conflict': 3,
                                                         'script': doc['script']})
                        if '$set' not in oplog['o'] and '$unset' not in oplog['o']:
                            log.warn('unexpected oplog: %s', oplog['o'])
                    elif op == 'd':  # delete
                        dbname, collname = parse_namespace(ns)
                        idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                        id = str(oplog['o']['_id'])
                        self._action_buf.append({'_op_type': 'delete',
                                                 '_index': idxname,
                                                 '_type': typename,
                                                 '_id': id})
                    elif op == 'c':  # command
                        dbname, _ = parse_namespace(ns)
                        idxname = self._conf.db_mapping(dbname)
                        if 'drop' in oplog['o']:
                            # TODO: how to delete a document type in Elasticsearch?
                            log.warn('you should implement document type deletion.')
                        if 'dropDatabase' in oplog['o']:
                            # delete index
                            self._dst.client().indices.delete(index=idxname)
                    elif op == 'n':  # no-op
                        pass
                    else:
                        log.error('invalid optype: %s' % oplog)

                    # flush
                    if self._action_buf_full():
                        self._dst.bulk_write(self._action_buf)
                        self._action_buf = []
                        self._last_bulk_optime = oplog['ts']

                    self._last_optime = oplog['ts']
                    self._log_optime(oplog['ts'])
                    self._log_progress()
                except StopIteration as e:
                    # flush
                    if len(self._action_buf) > 0:
                        self._dst.bulk_write(self._action_buf)
                        self._action_buf = []
                        self._last_bulk_optime = self._last_optime
                    self._log_optime(self._last_optime)
                    self._log_progress('latest')
                    time.sleep(0.1)
                except pymongo.errors.AutoReconnect as e:
                    log.error(e)
                    self._src.reconnect()
                    break
                except elasticsearch.helpers.BulkIndexError as e:
                    log.error(e)
                    self._action_buf = []
        except IndexError as e:
            log.error(e)
            log.error('%s not found, terminate' % oplog_start)
            return
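# doc_flat_to_nested() and merge_doc() are not defined in this section.
# Minimal sketches inferred from the '$set' branch above, which expands a
# dotted key path like 'a.b' into a nested document and merges it into the
# partial update; the real helpers may differ:

def doc_flat_to_nested(keys, value):
    """ Build a nested document from a key path,
    e.g. (['a', 'b'], 1) => {'a': {'b': 1}}. """
    doc = value
    for key in reversed(keys):
        doc = {key: doc}
    return doc

def merge_doc(doc, sub_doc):
    """ Recursively merge sub_doc into doc in place. """
    for k, v in sub_doc.iteritems():
        if k in doc and isinstance(doc[k], dict) and isinstance(v, dict):
            merge_doc(doc[k], v)
        else:
            doc[k] = v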
def load(filepath):
    """ Load config file and generate conf.
    """
    conf = Config()
    tml = toml.load(filepath)
    conf.src_conf = MongoConfig(tml['src']['hosts'],
                                tml['src'].get('authdb', 'admin'),
                                tml['src'].get('username', ''),
                                tml['src'].get('password', ''))
    # 'type' must be quoted here; the bare builtin `type` is never a key
    if 'type' not in tml['dst'] or tml['dst']['type'] == 'mongo':
        conf.dst_conf = MongoConfig(tml['dst']['hosts'],
                                    tml['dst'].get('authdb', 'admin'),
                                    tml['dst'].get('username', ''),
                                    tml['dst'].get('password', ''))
    elif tml['dst']['type'] == 'es':
        conf.dst_conf = EsConfig(tml['dst']['hosts'])
    else:
        raise Exception('invalid dst.type')

    if 'sync' in tml and 'dbs' in tml['sync']:
        for dbentry in tml['sync']['dbs']:
            if 'db' not in dbentry:
                raise Exception("'db' is missing in sync.dbs")
            if not dbentry['db']:
                raise Exception("'db' is empty in sync.dbs")
            dbname = dbentry['db'].strip()
            rename_db = dbentry['rename_db'].strip() if 'rename_db' in dbentry else ''

            # update db map
            if dbname and rename_db:
                if dbname in conf.dbmap:
                    raise Exception('duplicate dbname in sync.dbs: %s' % dbname)
                conf.dbmap[dbname] = rename_db

            if 'colls' in dbentry and dbentry['colls']:
                for collentry in dbentry['colls']:
                    # toml may yield unicode strings under Python 2
                    if isinstance(collentry, (str, unicode)):
                        collname = collentry.strip()
                        ns = gen_namespace(dbname, collname)
                        conf.data_filter.add_include_coll(ns)
                    elif isinstance(collentry, dict):
                        if 'coll' not in collentry:
                            raise Exception("'coll' is missing in sync.dbs.colls")
                        if not collentry['coll']:
                            raise Exception("'coll' is empty in sync.dbs.colls")
                        collname = collentry['coll'].strip()
                        fields = frozenset([f.strip() for f in collentry['fields']]
                                           if 'fields' in collentry else [])
                        # update coll filter
                        ns = gen_namespace(dbname, collname)
                        conf.data_filter.add_include_coll(ns)
                        # update fields
                        if fields:
                            if ns in conf.fieldmap:
                                raise Exception('duplicate collname in sync.dbs.colls: %s' % ns)
                            conf.fieldmap[ns] = fields
                    else:
                        raise Exception('invalid entry in sync.dbs.colls: %s' % collentry)
            else:
                # update coll filter
                conf.data_filter.add_include_coll(gen_namespace(dbname, '*'))

    if 'sync' in tml and 'start_optime' in tml['sync']:
        conf.start_optime = Timestamp(tml['sync']['start_optime'], 0)

    if 'log' in tml and 'filepath' in tml['log']:
        conf.logfilepath = tml['log']['filepath']

    if 'log' in tml and 'op_time_path' in tml['log']:
        conf.optime_logfilepath = tml['log']['op_time_path']
        optime_logger = OptimeLogger(conf.optime_logfilepath)
        if optime_logger.read():
            conf.start_optime = optime_logger.read()

    return conf
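# OptimeLogger is not defined in this section. A minimal sketch, assuming it
# persists the last bulk-written oplog timestamp to a file so that a restart
# can resume from it; the file format and the write-side API are assumptions:

import bson

class OptimeLogger(object):
    def __init__(self, filepath):
        self._filepath = filepath

    def read(self):
        """ Return the saved optime as a bson Timestamp, or None. """
        try:
            with open(self._filepath) as f:
                time, inc = f.read().split()
                return bson.timestamp.Timestamp(int(time), int(inc))
        except (IOError, ValueError):
            return None

    def write(self, optime):
        """ Persist a Timestamp as its (time, inc) pair. """
        with open(self._filepath, 'w') as f:
            f.write('%d %d' % (optime.time, optime.inc))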
def load(filepath):
    """ Load config file and generate conf.
    """
    conf = Config()
    tml = toml.load(filepath)

    conf.src_conf = MongoConfig(
        tml['src']['hosts'],
        authdb=tml['src'].get('authdb', 'admin'),  # default authdb is 'admin'
        username=tml['src'].get('username', ''),
        password=tml['src'].get('password', ''))

    if tml['dst']['type'] == 'mongo':
        conf.dst_conf = MongoConfig(
            tml['dst']['mongo']['hosts'],
            authdb=tml['dst']['mongo'].get('authdb', 'admin'),  # default authdb is 'admin'
            username=tml['dst']['mongo'].get('username', ''),
            password=tml['dst']['mongo'].get('password', ''))
    elif tml['dst']['type'] == 'es':
        conf.dst_conf = EsConfig(tml['dst']['es']['hosts'])
    else:
        raise Exception("invalid 'dst.type'")

    if 'sync' in tml and 'dbs' in tml['sync']:
        for dbentry in tml['sync']['dbs']:
            if 'db' not in dbentry:
                raise Exception("required option 'db' is missing")
            if not dbentry['db']:
                raise Exception("required option 'db' is empty")
            dbname = dbentry['db'].strip()
            rename_db = dbentry['rename_db'].strip() if 'rename_db' in dbentry else ''

            # update db map
            if dbname and rename_db:
                if dbname in conf.dbmap:
                    raise Exception("conflict dbname in 'sync.dbs': %s" % dbname)
                conf.dbmap[dbname] = rename_db

            if 'colls' in dbentry and dbentry['colls']:
                for collentry in dbentry['colls']:
                    if 'coll' not in collentry:
                        raise Exception("required option 'coll' is missing")
                    if not collentry['coll']:
                        raise Exception("required option 'coll' is empty")
                    collname = collentry['coll'].strip()
                    fields = frozenset([f.strip() for f in collentry['fields']]
                                       if 'fields' in collentry else [])
                    if collname == '*' and fields:
                        raise Exception("'fields' should be empty if 'coll' is '*'")
                    if collname == '*':
                        # update coll filter
                        conf.data_filter.add_include_coll(gen_namespace(dbname, '*'))
                    else:
                        # update coll filter
                        ns = gen_namespace(dbname, collname)
                        conf.data_filter.add_include_coll(ns)
                        # update fields
                        if fields:
                            if ns in conf.fieldmap:
                                raise Exception("conflict namespace in 'sync.colls': %s" % ns)
                            conf.fieldmap[ns] = fields
            else:
                # update coll filter
                conf.data_filter.add_include_coll(gen_namespace(dbname, '*'))

    if 'sync' in tml and 'start_optime' in tml['sync']:
        conf.start_optime = Timestamp(tml['sync']['start_optime'], 0)

    if 'log' in tml and 'filepath' in tml['log']:
        conf.logfilepath = tml['log']['filepath']

    return conf
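# A minimal sketch of a config file the loader above would accept. The key
# names come from the code; the host values, db/coll names, and the optime
# are made up for illustration:
#
#   [src]
#   hosts = "127.0.0.1:27017"
#   authdb = "admin"
#   username = ""
#   password = ""
#
#   [dst]
#   type = "es"
#
#   [dst.es]
#   hosts = "127.0.0.1:9200"
#
#   [sync]
#   start_optime = 1469000000
#
#   [[sync.dbs]]
#   db = "mydb"
#   rename_db = "mydb_renamed"
#
#   [[sync.dbs.colls]]
#   coll = "users"
#   fields = ["name", "age"]
#
#   [log]
#   filepath = "sync.log"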