class MongoSyncer(CommonSyncer):
    """ MongoDB synchronizer.
    """

    def __init__(self, conf):
        CommonSyncer.__init__(self, conf)

        if not isinstance(self._conf.src_conf, MongoConfig):
            raise RuntimeError('invalid src config type')
        self._src = MongoHandler(self._conf.src_conf)
        if not self._src.connect():
            raise RuntimeError('connect to mongodb(src) failed: %s'
                               % self._conf.src_hostportstr)

        if not isinstance(self._conf.dst_conf, MongoConfig):
            raise RuntimeError('invalid dst config type')
        self._dst = MongoHandler(self._conf.dst_conf)
        if not self._dst.connect():
            raise RuntimeError('connect to mongodb(dst) failed: %s'
                               % self._conf.dst_hostportstr)

        self._multi_oplog_replayer = MultiOplogReplayer(self._dst, 10)

    def _create_index(self, namespace_tuple):
        """ Create indexes.
        """
        def format(key_direction_list):
            """ Format key and direction of index.
            """
            res = []
            for key, direction in key_direction_list:
                if isinstance(direction, float) or isinstance(direction, long):
                    direction = int(direction)
                res.append((key, direction))
            return res

        dbname, collname = namespace_tuple
        dst_dbname, dst_collname = self._conf.db_coll_mapping(dbname, collname)
        index_info = self._src.client()[dbname][collname].index_information()
        for name, info in index_info.iteritems():
            keys = info['key']
            options = {}
            options['name'] = name
            if 'unique' in info:
                options['unique'] = info['unique']
            if 'sparse' in info:
                options['sparse'] = info['sparse']
            if 'expireAfterSeconds' in info:
                options['expireAfterSeconds'] = info['expireAfterSeconds']
            if 'partialFilterExpression' in info:
                options['partialFilterExpression'] = info['partialFilterExpression']
            if 'dropDups' in info:
                options['dropDups'] = info['dropDups']
            # indexes are created before documents are imported,
            # so the 'background' option is not needed
            # if 'background' in info:
            #     options['background'] = info['background']
            # for text indexes
            if 'weights' in info:
                options['weights'] = info['weights']
            if 'default_language' in info:
                options['default_language'] = info['default_language']
            if 'language_override' in info:
                options['language_override'] = info['language_override']
            self._dst.create_index(dst_dbname, dst_collname, format(keys), **options)

    def _sync_collection(self, namespace_tuple):
        """ Sync a collection until success.
        """
        # create indexes first
        self._create_index(namespace_tuple)

        src_dbname, src_collname = namespace_tuple
        dst_dbname, dst_collname = self._conf.db_coll_mapping(src_dbname, src_collname)
        src_ns = '%s.%s' % (src_dbname, src_collname)

        total = self._src.client()[src_dbname][src_collname].count()
        self._progress_logger.register(src_ns, total)

        while True:
            try:
                cursor = self._src.client()[src_dbname][src_collname].find(
                    filter=None,
                    cursor_type=pymongo.cursor.CursorType.EXHAUST,
                    no_cursor_timeout=True,
                    modifiers={'$snapshot': True})

                reqs = []
                reqs_max = 100
                groups = []
                groups_max = 10
                n = 0

                for doc in cursor:
                    reqs.append(pymongo.ReplaceOne({'_id': doc['_id']}, doc, upsert=True))
                    if len(reqs) == reqs_max:
                        groups.append(reqs)
                        reqs = []
                    if len(groups) == groups_max:
                        threads = [gevent.spawn(self._dst.bulk_write,
                                                dst_dbname,
                                                dst_collname,
                                                groups[i],
                                                ordered=False,
                                                ignore_duplicate_key_error=True)
                                   for i in xrange(groups_max)]
                        gevent.joinall(threads, raise_error=True)
                        groups = []
                    n += 1
                    if n % 10000 == 0:
                        self._progress_logger.add(src_ns, n)
                        n = 0

                if len(groups) > 0:
                    threads = [gevent.spawn(self._dst.bulk_write,
                                            dst_dbname,
                                            dst_collname,
                                            groups[i],
                                            ordered=False,
                                            ignore_duplicate_key_error=True)
                               for i in xrange(len(groups))]
                    gevent.joinall(threads, raise_error=True)
                if len(reqs) > 0:
                    self._dst.bulk_write(dst_dbname,
                                         dst_collname,
                                         reqs,
                                         ordered=False,
                                         ignore_duplicate_key_error=True)

                self._progress_logger.add(src_ns, n, done=True)
                return
            except pymongo.errors.AutoReconnect:
                self._src.reconnect()

    def _sync_large_collection(self, namespace_tuple, split_points):
        """ Sync a large collection with multiple processes.
        """
        # create indexes first
        self._create_index(namespace_tuple)

        dbname, collname = namespace_tuple
        ns = '.'.join(namespace_tuple)

        log.info('pending to sync %s with %d processes' % (ns, len(split_points) + 1))

        coll = self._src.client()[dbname][collname]
        total = coll.count()
        self._progress_logger.register(ns, total)

        prog_q = multiprocessing.Queue()
        res_q = multiprocessing.Queue()
        proc_logging = multiprocessing.Process(target=logging_progress,
                                               args=(ns, total, prog_q))
        proc_logging.start()

        # build non-overlapping '_id' range queries from the split points
        queries = []
        lower_bound = None
        for point in split_points:
            if lower_bound is None:
                queries.append({'_id': {'$lt': point}})
            else:
                queries.append({'_id': {'$gte': lower_bound, '$lt': point}})
            lower_bound = point
        queries.append({'_id': {'$gte': lower_bound}})

        procs = []
        for query in queries:
            p = multiprocessing.Process(target=self._sync_collection_with_query,
                                        args=(namespace_tuple, query, prog_q, res_q))
            p.start()
            procs.append(p)
            log.info('start process %s with query %s' % (p.name, query))

        for p in procs:
            p.join()

        n_docs = 0
        for p in procs:
            n_docs += res_q.get()
        self._progress_logger.add(ns, n_docs, done=True)

        prog_q.put(True)
        prog_q.close()
        prog_q.join_thread()
        proc_logging.join()

    def _sync_collection_with_query(self, namespace_tuple, query, prog_q, res_q):
        """ Sync a collection with the given query.
        """
        self._src.reconnect()
        self._dst.reconnect()

        src_dbname, src_collname = namespace_tuple
        dst_dbname, dst_collname = self._conf.db_coll_mapping(src_dbname, src_collname)

        while True:
            try:
                cursor = self._src.client()[src_dbname][src_collname].find(
                    filter=query,
                    cursor_type=pymongo.cursor.CursorType.EXHAUST,
                    no_cursor_timeout=True,
                    # $snapshot causes blocking, possibly a bug
                    # modifiers={'$snapshot': True}
                )

                total = 0
                n = 0
                reqs = []
                reqs_max = 100
                groups = []
                groups_max = 10

                for doc in cursor:
                    reqs.append(pymongo.ReplaceOne({'_id': doc['_id']}, doc, upsert=True))
                    if len(reqs) == reqs_max:
                        groups.append(reqs)
                        reqs = []
                    if len(groups) == groups_max:
                        threads = [gevent.spawn(self._dst.bulk_write,
                                                dst_dbname,
                                                dst_collname,
                                                groups[i],
                                                ordered=False,
                                                ignore_duplicate_key_error=True)
                                   for i in xrange(groups_max)]
                        gevent.joinall(threads, raise_error=True)
                        groups = []
                    n += 1
                    total += 1
                    if n % 10000 == 0:
                        prog_q.put(n)
                        n = 0

                if len(groups) > 0:
                    threads = [gevent.spawn(self._dst.bulk_write,
                                            dst_dbname,
                                            dst_collname,
                                            groups[i],
                                            ordered=False,
                                            ignore_duplicate_key_error=True)
                               for i in xrange(len(groups))]
                    gevent.joinall(threads, raise_error=True)
                if len(reqs) > 0:
                    self._dst.bulk_write(dst_dbname,
                                         dst_collname,
                                         reqs,
                                         ordered=False,
                                         ignore_duplicate_key_error=True)
                if n > 0:
                    prog_q.put(n)

                res_q.put(total)

                prog_q.close()
                prog_q.join_thread()
                res_q.close()
                res_q.join_thread()
                return
            except pymongo.errors.AutoReconnect:
                self._src.reconnect()

    def _replay_oplog(self, start_optime):
        """ Replay oplog.
        """
        self._last_optime = start_optime
        n_total = 0
        n_skip = 0

        while True:
            # try to get an oplog cursor until success
            try:
                start_optime_valid = False
                need_log = False
                host, port = self._src.client().address
                log.info('try to sync oplog from %s on %s:%d'
                         % (self._last_optime, host, port))
                cursor = self._src.tail_oplog(start_optime)
            except IndexError as e:
                log.error(e)
                log.error('%s not found, terminate' % self._last_optime)
                return
            except Exception as e:
                log.error('get oplog cursor failed: %s' % e)
                continue

            # loop: read and apply oplog
            while True:
                try:
                    if need_log:
                        self._log_optime(self._last_optime)
                        self._log_progress()
                        need_log = False

                    if not cursor.alive:
                        log.error('cursor is dead')
                        raise pymongo.errors.AutoReconnect

                    oplog = cursor.next()
                    n_total += 1

                    # check start optime once
                    if not start_optime_valid:
                        if oplog['ts'] == self._last_optime:
                            log.info('oplog is ok: %s' % self._last_optime)
                            start_optime_valid = True
                        else:
                            log.error('oplog %s is stale, terminate' % self._last_optime)
                            return

                    if oplog['op'] == 'n':  # no-op
                        self._last_optime = oplog['ts']
                        need_log = True
                        continue

                    # validate oplog
                    if not self._conf.data_filter.valid_oplog(oplog):
                        n_skip += 1
                        self._last_optime = oplog['ts']
                        need_log = True
                        continue

                    dbname, collname = mongo_utils.parse_namespace(oplog['ns'])
                    dst_dbname, dst_collname = self._conf.db_coll_mapping(dbname, collname)
                    if dst_dbname != dbname or dst_collname != collname:
                        oplog['ns'] = '%s.%s' % (dst_dbname, dst_collname)

                    if self._stage == Stage.post_initial_sync:
                        if self._multi_oplog_replayer:
                            if mongo_utils.is_command(oplog):
                                self._multi_oplog_replayer.apply(ignore_duplicate_key_error=True)
                                self._multi_oplog_replayer.clear()
                                self._dst.apply_oplog(oplog)
                                self._last_optime = oplog['ts']
                                need_log = True
                            else:
                                self._multi_oplog_replayer.push(oplog)
                                if (oplog['ts'] == self._initial_sync_end_optime
                                        or self._multi_oplog_replayer.count() == self._oplog_batchsize):
                                    self._multi_oplog_replayer.apply(ignore_duplicate_key_error=True)
                                    self._multi_oplog_replayer.clear()
                                    self._last_optime = oplog['ts']
                                    need_log = True
                        else:
                            self._dst.apply_oplog(oplog, ignore_duplicate_key_error=True)
                            self._last_optime = oplog['ts']
                            need_log = True
                        if oplog['ts'] == self._initial_sync_end_optime:
                            log.info('step into stage: oplog_sync')
                            self._stage = Stage.oplog_sync
                    else:
                        if self._multi_oplog_replayer:
                            if mongo_utils.is_command(oplog):
                                self._multi_oplog_replayer.apply()
                                self._multi_oplog_replayer.clear()
                                self._dst.apply_oplog(oplog)
                                self._last_optime = oplog['ts']
                                need_log = True
                            else:
                                self._multi_oplog_replayer.push(oplog)
                                if self._multi_oplog_replayer.count() == self._oplog_batchsize:
                                    self._multi_oplog_replayer.apply()
                                    self._multi_oplog_replayer.clear()
                                    self._last_optime = oplog['ts']
                                    need_log = True
                        else:
                            self._dst.apply_oplog(oplog)
                            self._last_optime = oplog['ts']
                            need_log = True
                except StopIteration as e:
                    if self._multi_oplog_replayer and self._multi_oplog_replayer.count() > 0:
                        self._multi_oplog_replayer.apply()
                        self._multi_oplog_replayer.clear()
                        self._last_optime = self._multi_oplog_replayer.last_optime()
                        need_log = True
                    # no more oplogs, wait a moment
                    time.sleep(0.1)
                    self._log_optime(self._last_optime)
                    self._log_progress('latest')
                except pymongo.errors.DuplicateKeyError as e:
                    if self._stage == Stage.oplog_sync:
                        log.error(e)
                        log.error('terminate')
                        return
                    else:
                        log.error('ignore duplicate key error: %s' % e)
                        continue
                except pymongo.errors.AutoReconnect as e:
                    log.error(e)
                    self._src.reconnect()
                    break
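
# Illustrative sketch (not used by the classes above): how MongoSyncer._sync_large_collection
# turns a sorted list of '_id' split points into non-overlapping range queries, one per
# worker process. The function name is hypothetical and exists only for demonstration;
# it assumes `split_points` is sorted in ascending order and non-empty.
def _example_build_range_queries(split_points):
    """ Return one query per partition: (-inf, p0), [p0, p1), ..., [pN, +inf). """
    queries = []
    lower_bound = None
    for point in split_points:
        if lower_bound is None:
            queries.append({'_id': {'$lt': point}})
        else:
            queries.append({'_id': {'$gte': lower_bound, '$lt': point}})
        lower_bound = point
    queries.append({'_id': {'$gte': lower_bound}})
    return queries

# e.g. _example_build_range_queries([100, 200]) returns
#   [{'_id': {'$lt': 100}},
#    {'_id': {'$gte': 100, '$lt': 200}},
#    {'_id': {'$gte': 200}}]
# so len(split_points) + 1 processes each scan a disjoint '_id' range.
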
class EsSyncer(CommonSyncer):
    """ Elasticsearch synchronizer.
    """

    def __init__(self, conf):
        CommonSyncer.__init__(self, conf)

        if not isinstance(self._conf.src_conf, MongoConfig):
            raise Exception('invalid src config type')
        self._src = MongoHandler(self._conf.src_conf)
        if not self._src.connect():
            raise Exception('connect to mongodb(src) failed: %s'
                            % self._conf.src_hostportstr)

        if not isinstance(self._conf.dst_conf, EsConfig):
            raise Exception('invalid dst config type')
        self._dst = EsHandler(self._conf.dst_conf)
        if not self._dst.connect():
            raise Exception('connect to elasticsearch(dst) failed: %s'
                            % self._conf.dst_hostportstr)

        self._action_buf = []  # used to bulk write oplogs
        self._last_bulk_optime = None

    def _action_buf_full(self):
        return len(self._action_buf) >= 40

    def _sync_database(self, dbname):
        """ Sync a database.
        """
        log.info("sync database '%s'" % dbname)
        # create index
        idxname = self._conf.db_mapping(dbname)
        if self._dst.client().indices.exists(index=idxname):
            log.info('index already exists: %s' % idxname)
        else:
            log.info('create index: %s' % idxname)
            self._dst.client().indices.create(index=idxname)
        self._sync_collections(dbname)

    def _sync_collection(self, namespace_tuple):
        """ Sync a collection until success.
        """
        src_dbname, src_collname = namespace_tuple[0], namespace_tuple[1]
        idxname, typename = self._conf.db_coll_mapping(src_dbname, src_collname)
        fields = self._conf.fieldmap.get(gen_namespace(src_dbname, src_collname))

        while True:
            try:
                log.info("sync collection '%s.%s' => '%s.%s'"
                         % (src_dbname, src_collname, idxname, typename))
                cursor = self._src.client()[src_dbname][src_collname].find(
                    filter=None,
                    cursor_type=pymongo.cursor.CursorType.EXHAUST,
                    no_cursor_timeout=True,
                    modifiers={'$snapshot': True})
                count = cursor.count()
                if count == 0:
                    log.info(' skip empty collection')
                    return

                n = 0
                actions = []
                actions_max = 20
                groups = []
                groups_max = 10

                for doc in cursor:
                    id = str(doc['_id'])
                    del doc['_id']
                    source = gen_doc_with_fields(doc, fields) if fields else doc
                    if source:
                        actions.append({'_op_type': 'index',
                                        '_index': idxname,
                                        '_type': typename,
                                        '_id': id,
                                        '_source': source})
                    if len(actions) == actions_max:
                        groups.append(actions)
                        actions = []
                    if len(groups) == groups_max:
                        threads = [gevent.spawn(self._dst.bulk_write, groups[i])
                                   for i in xrange(groups_max)]
                        gevent.joinall(threads)
                        groups = []
                    n += 1
                    if n % 1000 == 0:
                        log.info(' %s.%s %d/%d (%.2f%%)'
                                 % (src_dbname, src_collname, n, count,
                                    float(n) / count * 100))

                if len(groups) > 0:
                    threads = [gevent.spawn(self._dst.bulk_write, groups[i])
                               for i in xrange(len(groups))]
                    gevent.joinall(threads)
                if len(actions) > 0:
                    elasticsearch.helpers.bulk(client=self._dst.client(),
                                               actions=actions)

                log.info(' %s.%s %d/%d (%.2f%%)'
                         % (src_dbname, src_collname, n, count,
                            float(n) / count * 100))
                return
            except pymongo.errors.AutoReconnect:
                self._src.reconnect()

    def _replay_oplog(self, oplog_start):
        """ Replay oplog.
        """
        self._last_bulk_optime = oplog_start

        n_total = 0
        n_skip = 0

        while True:
            # try to get a cursor until success
            try:
                host, port = self._src.client().address
                log.info('try to sync oplog from %s on %s:%d'
                         % (self._last_bulk_optime, host, port))
                # set codec options to guarantee the order of keys in commands
                coll = self._src.client()['local'].get_collection(
                    'oplog.rs',
                    codec_options=bson.codec_options.CodecOptions(
                        document_class=bson.son.SON))
                cursor = coll.find({'ts': {'$gte': oplog_start}},
                                   cursor_type=pymongo.cursor.CursorType.TAILABLE_AWAIT,
                                   no_cursor_timeout=True)

                # New in version 3.2
                # src_version = mongo_utils.get_version(self._src.client())
                # if mongo_utils.version_higher_or_equal(src_version, '3.2.0'):
                #     cursor.max_await_time_ms(1000)

                valid_start_optime = False  # need to validate

                while True:
                    try:
                        if not cursor.alive:
                            log.error('cursor is dead')
                            raise pymongo.errors.AutoReconnect

                        oplog = cursor.next()
                        n_total += 1

                        if not valid_start_optime:
                            if oplog['ts'] == oplog_start:
                                log.info('oplog is ok: %s' % oplog_start)
                                valid_start_optime = True
                            else:
                                log.error('oplog %s is stale, terminate' % oplog_start)
                                return

                        # validate oplog
                        if not self._conf.data_filter.valid_oplog(oplog):
                            n_skip += 1
                            self._last_optime = oplog['ts']
                            continue

                        op = oplog['op']
                        ns = oplog['ns']

                        if op == 'i':  # insert
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                            fields = self._conf.fieldmap.get(gen_namespace(dbname, collname))
                            doc = oplog['o']
                            id = str(doc['_id'])
                            del doc['_id']
                            if fields:
                                doc = gen_doc_with_fields(doc, fields)
                            if doc:
                                self._action_buf.append({'_op_type': 'index',
                                                         '_index': idxname,
                                                         '_type': typename,
                                                         '_id': id,
                                                         '_source': doc})
                        elif op == 'u':  # update
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                            fields = self._conf.fieldmap.get(gen_namespace(dbname, collname))
                            id = str(oplog['o2']['_id'])
                            if '$set' in oplog['o']:
                                doc = {}
                                for k, v in oplog['o']['$set'].iteritems():
                                    if not fields or k in fields:
                                        sub_doc = doc_flat_to_nested(k.split('.'), v)
                                        merge_doc(doc, sub_doc)
                                if doc:
                                    self._action_buf.append({'_op_type': 'update',
                                                             '_index': idxname,
                                                             '_type': typename,
                                                             '_id': id,
                                                             '_retry_on_conflict': 3,
                                                             'doc': doc,
                                                             'doc_as_upsert': True})
                            if '$unset' in oplog['o']:
                                script_statements = []
                                for keypath in oplog['o']['$unset'].iterkeys():
                                    if not fields or keypath in fields:
                                        pos = keypath.rfind('.')
                                        if pos >= 0:
                                            script_statements.append(
                                                'ctx._source.%s.remove("%s")'
                                                % (keypath[:pos], keypath[pos + 1:]))
                                        else:
                                            script_statements.append(
                                                'ctx._source.remove("%s")' % keypath)
                                if script_statements:
                                    doc = {'script': '; '.join(script_statements)}
                                    self._action_buf.append({'_op_type': 'update',
                                                             '_index': idxname,
                                                             '_type': typename,
                                                             '_id': id,
                                                             '_retry_on_conflict': 3,
                                                             'script': doc['script']})
                            if '$set' not in oplog['o'] and '$unset' not in oplog['o']:
                                log.warn('unexpected oplog: %s', oplog['o'])
                        elif op == 'd':  # delete
                            dbname, collname = parse_namespace(ns)
                            idxname, typename = self._conf.db_coll_mapping(dbname, collname)
                            id = str(oplog['o']['_id'])
                            self._action_buf.append({'_op_type': 'delete',
                                                     '_index': idxname,
                                                     '_type': typename,
                                                     '_id': id})
                        elif op == 'c':  # command
                            dbname, _ = parse_namespace(ns)
                            idxname = self._conf.db_mapping(dbname)
                            if 'drop' in oplog['o']:
                                # TODO: how to delete a document type?
                                log.warn('you should implement document type deletion.')
                            if 'dropDatabase' in oplog['o']:
                                # delete index
                                self._dst.client().indices.delete(index=idxname)
                        elif op == 'n':  # no-op
                            pass
                        else:
                            log.error('invalid optype: %s' % oplog)

                        # flush
                        if self._action_buf_full():
                            self._dst.bulk_write(self._action_buf)
                            self._action_buf = []
                            self._last_bulk_optime = oplog['ts']

                        self._last_optime = oplog['ts']
                        self._log_optime(oplog['ts'])
                        self._log_progress()
                    except StopIteration as e:
                        # flush
                        if len(self._action_buf) > 0:
                            self._dst.bulk_write(self._action_buf)
                            self._action_buf = []
                            self._last_bulk_optime = self._last_optime
                        self._log_optime(self._last_optime)
                        self._log_progress('latest')
                        time.sleep(0.1)
                    except pymongo.errors.AutoReconnect as e:
                        log.error(e)
                        self._src.reconnect()
                        break
                    except elasticsearch.helpers.BulkIndexError as e:
                        log.error(e)
                        self._action_buf = []
            except IndexError as e:
                log.error(e)
                log.error('%s not found, terminate' % oplog_start)
                return
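
# Illustrative sketch (not part of the syncer): roughly what the doc_flat_to_nested and
# merge_doc helpers used in EsSyncer._replay_oplog are expected to do when a '$set' oplog
# entry carries dotted keypaths. These stand-in implementations are assumptions for
# demonstration only; the real helpers are defined elsewhere in the project.
def _example_flat_to_nested(keys, value):
    """ ['a', 'b', 'c'], 1  ->  {'a': {'b': {'c': 1}}} """
    doc = value
    for key in reversed(keys):
        doc = {key: doc}
    return doc


def _example_merge(dst, src):
    """ Recursively merge `src` into `dst` in place. """
    for k, v in src.items():
        if isinstance(v, dict) and isinstance(dst.get(k), dict):
            _example_merge(dst[k], v)
        else:
            dst[k] = v
    return dst

# e.g. a '$set' of {'a.b': 1, 'a.c': 2} becomes the partial document {'a': {'b': 1, 'c': 2}},
# which is sent as the 'doc' of a bulk 'update' action with doc_as_upsert=True.
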