if os.path.exists(args.source):
    sources = utils.parse_source_file(args.source)
else:
    sources = [utils.parse_mongo_url(args.source)]

# initialize sqlite database that holds our state (this may seem like overkill,
# but it's actually needed to ensure proper synchronization of subprocesses)
if not args.state_db:
    args.state_db = '%s.%s.db' % (sources[0]['db'], sources[0]['collection'])

if args.state_db.startswith('/'):
    state_db_path = args.state_db
else:
    state_db_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 args.state_db)

log.info('using state db %s' % state_db_path)

state_db_exists = os.path.exists(state_db_path)
state_db = CopyStateDB(state_db_path)
if not state_db_exists:
    state_db.drop_and_create()

if args.restart:
    state_db.drop_and_create()

# do the real work
copy_collection_parent(sources, dest, state_db, args)

log.error("shouldn't reach this point")
sys.exit(1)
dest = utils.parse_mongo_url(args.dest)
if os.path.exists(args.source):
    sources = utils.parse_source_file(args.source)
else:
    sources = [utils.parse_mongo_url(args.source)]

# initialize sqlite database that holds our state (this may seem like overkill,
# but it's actually needed to ensure proper synchronization of subprocesses)
if not args.state_db:
    args.state_db = "%s.%s.db" % (sources[0]["db"], sources[0]["collection"])

if args.state_db.startswith("/"):
    state_db_path = args.state_db
else:
    state_db_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 args.state_db)

log.info("using state db %s" % state_db_path)

state_db_exists = os.path.exists(state_db_path)
state_db = CopyStateDB(state_db_path)
if not state_db_exists:
    state_db.drop_and_create()

if args.restart:
    state_db.drop_and_create()

# do the real work
copy_collection_parent(sources, dest, state_db, args)

log.error("shouldn't reach this point")
sys.exit(1)
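# The comments above lean on a small sqlite file as the synchronization point between
# subprocesses. A minimal sketch of what a CopyStateDB-like wrapper could look like
# follows; the real class's schema and methods are not shown in this section, so the
# table layout and method bodies here are assumptions.
import sqlite3


class SketchStateDB(object):
    """Toy sqlite-backed progress tracker, keyed by (source, dest)."""

    STATE_INITIAL_COPY = 'initial copy'
    STATE_WAITING_FOR_INDICES = 'waiting for indices'
    STATE_APPLYING_OPLOG = 'applying oplog'

    def __init__(self, path):
        # sqlite serializes concurrent writers, which is what makes a single file a
        # safe meeting point for independently forked copy processes
        self._conn = sqlite3.connect(path)

    def drop_and_create(self):
        with self._conn:
            self._conn.execute("DROP TABLE IF EXISTS copies")
            self._conn.execute(
                "CREATE TABLE copies ("
                "  source   TEXT,"
                "  dest     TEXT,"
                "  state    TEXT,"
                "  oplog_ts TEXT,"
                "  PRIMARY KEY (source, dest))")

    def update_state(self, source, dest, state):
        key = ('%(host)s:%(port)s/%(db)s.%(collection)s' % source,
               '%(host)s:%(port)s/%(db)s.%(collection)s' % dest)
        with self._conn:
            self._conn.execute(
                "INSERT OR IGNORE INTO copies (source, dest) VALUES (?, ?)", key)
            self._conn.execute(
                "UPDATE copies SET state = ? WHERE source = ? AND dest = ?",
                (state,) + key)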
def copy_collection(source, dest, state_path, percent):
    """
    Copies all documents from source to destination collection.

    Inserts documents in batches using insert workers, which are each run in their own
    greenlet. Ensures that the destination is empty before starting the copy.

    Does no safety checks -- this is up to the caller.

    @param source      dict of (host, port, db, collection) for the source
    @param dest        dict of (host, port, db, collection) for the destination
    @param state_path  path of state database
    @param percent     percentage of documents to copy
    """
    gevent.monkey.patch_socket()

    # open state database
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(source['host'], source['port'],
                                        ensure_direct=True,
                                        max_pool_size=30,
                                        read_preference=ReadPreference.SECONDARY,
                                        document_class=FasterOrderedDict)
    source_collection = source_client[source['db']][source['collection']]
    if source_client.is_mongos:
        raise Exception("for performance reasons, sources must be mongod instances; %s:%d is not"
                        % (source['host'], source['port']))

    dest_client = utils.mongo_connect(dest['host'], dest['port'],
                                      max_pool_size=30,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[dest['db']][dest['collection']]

    # record timestamp of last oplog entry, so that we know where to start applying ops
    # later
    oplog_ts = utils.get_last_oplog_entry(source_client)['ts']
    state_db.update_oplog_ts(source, dest, oplog_ts)

    # for testing copying of indices quickly
    if percent == 0:
        log.info("skipping copy because of --percent 0 parameters")
        state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
        return

    stats = Stats()
    stats.total_docs = int(source_collection.count())
    if percent:
        # hack-ish but good enough for a testing-only feature
        stats.total_docs = int(stats.total_docs * (float(percent) / 100.0))

    # get all _ids, which works around a mongo bug/feature that causes massive slowdowns
    # of long-running, large reads over time
    ids = []
    cursor = source_collection.find(fields=["_id"], snapshot=True, timeout=False)
    cursor.batch_size(5000)

    insert_pool = Pool(INSERT_POOL_SIZE)
    stats_greenlet = gevent.spawn(_copy_stats_worker, stats)

    for doc in cursor:
        _id = doc['_id']

        if percent is not None and not utils.id_in_subset(_id, percent):
            continue

        # when we've gathered enough _ids, spawn a worker greenlet to batch copy the
        # documents corresponding to them
        ids.append(_id)
        if len(ids) % INSERT_SIZE == 0:
            outgoing_ids = ids
            ids = []
            insert_pool.spawn(_find_and_insert_batch_worker,
                              source_collection=source_collection,
                              dest_collection=dest_collection,
                              ids=outgoing_ids,
                              stats=stats)
        gevent.sleep()

    # insert last batch of documents
    if len(ids) > 0:
        _find_and_insert_batch_worker(source_collection=source_collection,
                                      dest_collection=dest_collection,
                                      ids=ids,
                                      stats=stats)
        stats.log()

    # wait until all other outstanding inserts have finished
    insert_pool.join()
    stats_greenlet.kill()
    log.info("done with initial copy")

    state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
def copy_collection(source, dest, state_path, percent):
    """
    Copies all documents from source to destination collection.

    Inserts documents in batches using insert workers, which are each run in their own
    greenlet. Ensures that the destination is empty before starting the copy.

    Does no safety checks -- this is up to the caller.

    @param source      dict of (host, port, db, collection) for the source
    @param dest        dict of (host, port, db, collection) for the destination
    @param state_path  path of state database
    @param percent     percentage of documents to copy
    """
    gevent.monkey.patch_socket()

    # open state database
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(source,
                                        ensure_direct=True,
                                        max_pool_size=30,
                                        read_preference=ReadPreference.SECONDARY,
                                        document_class=FasterOrderedDict)
    source_collection = source_client[source['db']][source['collection']]
    if source_client.is_mongos:
        raise Exception("for performance reasons, sources must be mongod instances; %s:%d is not"
                        % (source['host'], source['port']))

    dest_client = utils.mongo_connect(dest, max_pool_size=30, document_class=FasterOrderedDict)
    dest_collection = dest_client[dest['db']][dest['collection']]

    # record timestamp of last oplog entry, so that we know where to start applying ops
    # later
    oplog_ts = utils.get_last_oplog_entry(source_client)['ts']
    state_db.update_oplog_ts(source, dest, oplog_ts)

    # for testing copying of indices quickly
    if percent == 0:
        log.info("skipping copy because of --percent 0 parameters")
        state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
        return

    stats = Stats()
    stats.total_docs = int(source_collection.count())
    if percent:
        # hack-ish but good enough for a testing-only feature
        stats.total_docs = int(stats.total_docs * (float(percent) / 100.0))

    # get all _ids, which works around a mongo bug/feature that causes massive slowdowns
    # of long-running, large reads over time
    ids = []
    cursor = source_collection.find(projection={'_id': True},
                                    modifiers={'$snapshot': True})
    cursor.batch_size(5000)

    insert_pool = Pool(INSERT_POOL_SIZE)
    stats_greenlet = gevent.spawn(_copy_stats_worker, stats)

    for doc in cursor:
        _id = doc['_id']

        if percent is not None and not utils.id_in_subset(_id, percent):
            continue

        # when we've gathered enough _ids, spawn a worker greenlet to batch copy the
        # documents corresponding to them
        ids.append(_id)
        if len(ids) % INSERT_SIZE == 0:
            outgoing_ids = ids
            ids = []
            insert_pool.spawn(_find_and_insert_batch_worker,
                              source_collection=source_collection,
                              dest_collection=dest_collection,
                              ids=outgoing_ids,
                              stats=stats)
        gevent.sleep()

    # insert last batch of documents
    if len(ids) > 0:
        _find_and_insert_batch_worker(source_collection=source_collection,
                                      dest_collection=dest_collection,
                                      ids=ids,
                                      stats=stats)
        stats.log()

    # wait until all other outstanding inserts have finished
    insert_pool.join()
    stats_greenlet.kill()
    log.info("done with initial copy")

    state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
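# copy_collection() above only gathers _ids and hands each batch to
# _find_and_insert_batch_worker via the greenlet pool; the worker itself is not shown
# in this section. A rough sketch of that batch copy step, assuming a pymongo 3 style
# API and a hypothetical stats.inserted counter, might be:
import pymongo.errors


def _find_and_insert_batch_worker_sketch(source_collection, dest_collection, ids, stats):
    # re-read the full documents for this batch of _ids from the source...
    docs = list(source_collection.find({'_id': {'$in': ids}}))
    if not docs:
        return

    # ...and write them to the destination in one round trip; ordered=False lets the
    # server continue past individual duplicate-key errors left over from an earlier run
    try:
        dest_collection.insert_many(docs, ordered=False)
    except pymongo.errors.BulkWriteError:
        pass

    stats.inserted += len(docs)  # hypothetical counter read by _copy_stats_worker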
def main():
    # NOTE: we are not gevent monkey-patched here; only child processes are monkey-patched,
    # so all ops below are synchronous

    # parse command-line options
    import argparse
    parser = argparse.ArgumentParser(
        description='Copy collections from one mongod to another. demo:\n'
                    'mongo_copier --srchost 192.168.37.12 --srcport 1234 '
                    '--desthost 192.168.37.14 --destport 5678 --destuser write --destpwd write '
                    '--manifests /tmp/1.txt --restart --drop',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--srchost', type=str, required=True, metavar='HOST',
                        help='hostname or IP of source mongod')
    parser.add_argument('--srcport', type=int, required=True, metavar='PORT',
                        help='port of source mongod')
    parser.add_argument('--desthost', type=str, required=True, metavar='HOST',
                        help='hostname or IP of destination mongod')
    parser.add_argument('--destport', type=int, required=True, metavar='PORT',
                        help='port of destination mongod')
    parser.add_argument('--destuser', type=str, required=True, metavar='USERNAME',
                        help='username for destination mongod; must have write access')
    parser.add_argument('--destpwd', type=str, required=True, metavar='PASSWORD',
                        help='password for destination mongod')
    parser.add_argument('--percent', type=int, metavar='PCT', default=None,
                        help='copy only PCT%% of data')
    parser.add_argument('--drop', action='store_true',
                        help='delete destination collection data before copy')
    parser.add_argument('--restart', action='store_true',
                        help='restart from the beginning, ignoring any prior progress')
    parser.add_argument('--state-db', type=str, metavar='PATH', default=None,
                        help='path to state file (defaults to\n'
                             '/tmp/mongo_copier_states/<srchost>_<desthost>.db)')
    parser.add_argument('--manifests', type=str, required=True, metavar='FILE',
                        help='a file containing collections to copy, one strategy per line.\n'
                             'e.g.\n'
                             'copy a collection\n'
                             '    dbname.colname\n'
                             'copy a collection, changing the destination database or collection name\n'
                             '    dbname.colname newdbname.newcolname\n'
                             'copy a collection with a query\n'
                             '    dbname.colname {"id":{"$in":[123,456]}}\n'
                             'copy all collections of a database\n'
                             '    dbname.*\n'
                             'copy all collections of a database, changing the destination database name\n'
                             '    dbname.* newdbname.*\n'
                             'copy all collections of a database with a query\n'
                             '    dbname.* {"id":{"$in":[123,456]}}\n'
                             'copy the collection from all databases\n'
                             '    *.dbname\n'
                             'copy the collection from all databases, changing the destination collection name\n'
                             '    *.dbname *.newdbname\n'
                             'copy the collection from all databases with a query\n'
                             '    *.dbname {"id":{"$in":[123,456]}}\n'
                             'copy all collections of all databases\n'
                             '    *.*\n'
                             'copy all collections of all databases with a query\n'
                             '    *.* {"id":{"$in":[123,456]}}\n')
    args = parser.parse_args()
    log.debug(args)

    # parse the manifests file
    if os.path.exists(args.manifests):
        manifests = utils.parse_manifests_file(args)
    else:
        die(log, "manifests file does not exist: %s" % args.manifests)

    # initialize sqlite database that holds our state (this may seem like overkill,
    # but it's actually needed to ensure proper synchronization of subprocesses)
    if not args.state_db:
        args.state_db = '/tmp/mongo_copier_states/%s_%s.db' % (args.srchost, args.desthost)
        if not os.path.isdir("/tmp/mongo_copier_states/"):
            os.mkdir("/tmp/mongo_copier_states/")

    if args.state_db.startswith('/'):
        state_db_path = args.state_db
    else:
        state_db_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                     args.state_db)

    log.info('using state db %s' % state_db_path)

    state_db_exists = os.path.exists(state_db_path)
    state_db = CopyStateDB(state_db_path)
    if not state_db_exists or args.restart:
        state_db.drop_and_create()

    # do the real work
    copy_collection_parent(manifests, state_db, args)
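# main() deliberately stays un-monkey-patched; copy_collection_parent() (not shown in
# this section) is what forks one worker process per manifest entry, and only those
# children call gevent.monkey.patch_socket(). A bare-bones sketch of that parent/child
# split follows; the names, the state_path argument, and the lack of restart handling
# are all assumptions about code that lives elsewhere in this repo.
import multiprocessing


def _copy_child_sketch(manifest, state_path, percent):
    # the child process is where gevent patching happens, exactly as copy_collection() expects
    copy_collection(manifest, state_path, percent)


def copy_collection_parent_sketch(manifests, state_path, args):
    procs = []
    for manifest in manifests:
        p = multiprocessing.Process(target=_copy_child_sketch,
                                    args=(manifest, state_path, args.percent))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()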
def copy_collection(manifest, state_path, percent):
    """
    Copies all documents from source to destination collection.

    Inserts documents in batches using insert workers, which are each run in their own
    greenlet.

    Does no safety checks -- this is up to the caller.

    @param manifest    dict of (srchost, srcport, srcuser, srcpwd, srcdb, srccol,
                       desthost, destport, destuser, destpwd, destdb, destcol)
    @param state_path  path of state database
    @param percent     percentage of documents to copy
    """
    gevent.monkey.patch_socket()

    # open state database
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(manifest['srchost'],
                                        manifest['srcport'],
                                        manifest['srcuser'],
                                        manifest['srcpwd'],
                                        maxPoolSize=30,
                                        read_preference=ReadPreference.SECONDARY,
                                        document_class=FasterOrderedDict)
    source_collection = source_client[manifest['srcdb']][manifest['srccol']]
    if source_client.is_mongos:
        raise Exception("for performance reasons, sources must be mongod instances; %s:%d is not"
                        % (manifest['srchost'], manifest['srcport']))

    dest_client = utils.mongo_connect(manifest['desthost'],
                                      manifest['destport'],
                                      manifest['destuser'],
                                      manifest['destpwd'],
                                      maxPoolSize=30,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[manifest['destdb']][manifest['destcol']]

    # for testing copying of indices quickly
    if percent == 0:
        log.info("skipping copy because of --percent 0 parameters")
        state_db.update_state(manifest, CopyStateDB.STATE_APPLYING_OPLOG)
        return

    stats = Stats()
    stats.total_docs = int(source_collection.count(filter=manifest["query"]))
    if percent:
        # hack-ish but good enough for a testing-only feature
        stats.total_docs = int(stats.total_docs * (float(percent) / 100.0))

    # get all _ids, which works around a mongo bug/feature that causes massive slowdowns
    # of long-running, large reads over time
    ids = []
    cursor = source_collection.find(filter=manifest["query"],
                                    projection={"_id": True},
                                    no_cursor_timeout=True)  # keep the cursor alive for long scans
    cursor.batch_size(5000)

    insert_pool = Pool(INSERT_POOL_SIZE)
    stats_greenlet = gevent.spawn(_copy_stats_worker, stats)

    for doc in cursor:
        _id = doc['_id']

        if percent is not None and not utils.id_in_subset(_id, percent):
            continue

        # when we've gathered enough _ids, spawn a worker greenlet to batch copy the
        # documents corresponding to them
        ids.append(_id)
        if len(ids) % INSERT_SIZE == 0:
            outgoing_ids = ids
            ids = []
            insert_pool.spawn(_find_and_insert_batch_worker,
                              source_collection=source_collection,
                              dest_collection=dest_collection,
                              ids=outgoing_ids,
                              stats=stats)
        gevent.sleep()

    # insert last batch of documents
    if len(ids) > 0:
        _find_and_insert_batch_worker(source_collection=source_collection,
                                      dest_collection=dest_collection,
                                      ids=ids,
                                      stats=stats)
        stats.log()

    # wait until all other outstanding inserts have finished
    insert_pool.join()
    stats_greenlet.kill()

    srccount = stats.total_docs
    destcount = dest_collection.count(filter=manifest["query"])
    if srccount == destcount:
        log.info("COPY SUCCEEDED. srccount(%d) == destcount(%d)" % (srccount, destcount))
    else:
        log.error("COPY FAILED. srccount(%d) != destcount(%d)" % (srccount, destcount))

    state_db.update_state(manifest, CopyStateDB.STATE_APPLYING_OPLOG)
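# Both the copy loop above and the oplog apply phase filter documents through
# utils.id_in_subset(_id, percent), so a --percent run touches a stable pseudo-random
# slice of the collection. The real helper lives in utils and is not shown here; one
# common way to implement such a check, sketched as an assumption, is to hash the _id
# into a 0-99 bucket:
import hashlib


def id_in_subset_sketch(_id, percent):
    # the same _id always hashes to the same bucket, so the initial copy and the
    # later oplog replay agree on which documents belong to the subset
    bucket = int(hashlib.md5(str(_id).encode('utf-8')).hexdigest(), 16) % 100
    return bucket < percent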
def apply_oplog(source, dest, percent, state_path):
    """
    Applies oplog entries from source to destination.

    Since the oplog storage format has known and possibly unknown idiosyncrasies, we take
    a conservative approach. Insert and delete ops can be replayed directly. For updates,
    we do the following:

    1. Note the _id of the updated document
    2. Retrieve the updated document from the source
    3. Upsert the updated document in the destination

    @param source      dict of (host, port, db, collection) for the source
    @param dest        dict of (host, port, db, collection) for the destination
    @param percent     percentage of documents to apply ops for (None = all)
    @param state_path  path of state database
    """
    gevent.monkey.patch_socket()

    stats = ApplyStats()
    apply_workers = Pool(20)

    # connect to state db
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(source,
                                        ensure_direct=True,
                                        max_pool_size=30,
                                        read_preference=ReadPreference.SECONDARY,
                                        document_class=FasterOrderedDict)
    source_collection = source_client[source['db']][source['collection']]

    dest_client = utils.mongo_connect(dest, max_pool_size=30, document_class=FasterOrderedDict)
    dest_collection = dest_client[dest['db']][dest['collection']]

    oplog = source_client['local']['oplog.rs']

    # print stats periodically
    stats.paused = True
    stats_greenlet = gevent.spawn(oplog_stats_worker, stats)

    # checkpoint oplog position to disk periodically
    checkpoint_greenlet = gevent.spawn(oplog_checkpoint_worker, stats, source, dest, state_db)

    # figure out where we need to start reading oplog entries; rewind our oplog timestamp
    # a bit, to avoid issues with the user pressing Control-C while some ops are pending
    #
    # this works, because oplog entries are idempotent
    start_ts_orig = state_db.get_oplog_ts(source, dest)
    start_ts = bson.Timestamp(time=start_ts_orig.time - TS_REWIND, inc=0)
    log.info("starting apply at %s", start_ts)

    # perform tailing oplog query using the oplog_replay option to efficiently find
    # our starting position in the oplog
    query = {}
    query['ts'] = {'$gte': start_ts}
    query['ns'] = source_collection.full_name
    cursor = oplog.find(query, cursor_type=CursorType.TAILABLE_AWAIT)
    cursor.add_option(pymongo.cursor._QUERY_OPTIONS['oplog_replay'])
    while True:
        for op in cursor:
            stats.paused = False

            _id = _op_id(op)
            if percent and not utils.id_in_subset(_id, percent):
                continue

            stats.ops_retrieved += 1

            # block *all* further ops from being applied if there's a pending
            # op on the current _id, to ensure serialization
            while _id in stats.pending_ids:
                gevent.sleep(0.1)
                stats.sleeps += 1

            # do the real oplog work in a greenlet from the pool
            stats.pending_ids.add(_id)
            apply_workers.spawn(_apply_op_worker,
                                op,
                                source_collection,
                                dest_collection,
                                stats)

            # update our last timestamp; this is *not* guaranteed to be the timestamp of the
            # most recent op, which is impossible because of our out-of-order execution
            #
            # this is an approximation that needs to be accurate to within TS_REWIND seconds
            stats.last_ts = op['ts']

        # while we have a tailable cursor, it can stop iteration if no more results come back
        # in a reasonable time, so sleep for a bit then try to continue iteration
        if cursor.alive:
            log.debug("replayed all oplog entries; sleeping...")
            stats.paused = True
            gevent.sleep(2)
            stats.paused = False
        else:
            log.error("cursor died on us!")
            break

    # just to silence pyflakes...
    stats_greenlet.kill()
    checkpoint_greenlet.kill()
def apply_oplog(source, dest, percent, state_path):
    """
    Applies oplog entries from source to destination.

    Since the oplog storage format has known and possibly unknown idiosyncrasies, we take
    a conservative approach. Insert and delete ops can be replayed directly. For updates,
    we do the following:

    1. Note the _id of the updated document
    2. Retrieve the updated document from the source
    3. Upsert the updated document in the destination

    @param source      dict of (host, port, db, collection) for the source
    @param dest        dict of (host, port, db, collection) for the destination
    @param percent     percentage of documents to apply ops for (None = all)
    @param state_path  path of state database
    """
    gevent.monkey.patch_socket()

    stats = ApplyStats()
    apply_workers = Pool(20)

    # connect to state db
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(source,
                                        ensure_direct=True,
                                        max_pool_size=30,
                                        read_preference=ReadPreference.SECONDARY,
                                        document_class=FasterOrderedDict)
    source_collection = source_client[source["db"]][source["collection"]]

    dest_client = utils.mongo_connect(dest, max_pool_size=30, document_class=FasterOrderedDict)
    dest_collection = dest_client[dest["db"]][dest["collection"]]

    oplog = source_client["local"]["oplog.rs"]

    # print stats periodically
    stats.paused = True
    stats_greenlet = gevent.spawn(oplog_stats_worker, stats)

    # checkpoint oplog position to disk periodically
    checkpoint_greenlet = gevent.spawn(oplog_checkpoint_worker, stats, source, dest, state_db)

    # figure out where we need to start reading oplog entries; rewind our oplog timestamp
    # a bit, to avoid issues with the user pressing Control-C while some ops are pending
    #
    # this works, because oplog entries are idempotent
    start_ts_orig = state_db.get_oplog_ts(source, dest)
    start_ts = bson.Timestamp(time=start_ts_orig.time - TS_REWIND, inc=0)
    log.info("starting apply at %s", start_ts)

    # perform tailing oplog query using the oplog_replay option to efficiently find
    # our starting position in the oplog
    query = {}
    query["ts"] = {"$gte": start_ts}
    query["ns"] = source_collection.full_name
    cursor = oplog.find(query, cursor_type=pymongo.CursorType.TAILABLE_AWAIT, oplog_replay=True)
    # cursor.add_option(pymongo.cursor._QUERY_OPTIONS['oplog_replay'])
    while True:
        for op in cursor:
            stats.paused = False

            _id = _op_id(op)
            if percent and not utils.id_in_subset(_id, percent):
                continue

            stats.ops_retrieved += 1

            # block *all* further ops from being applied if there's a pending
            # op on the current _id, to ensure serialization
            while _id in stats.pending_ids:
                gevent.sleep(0.1)
                stats.sleeps += 1

            # do the real oplog work in a greenlet from the pool
            stats.pending_ids.add(_id)
            apply_workers.spawn(_apply_op_worker,
                                op,
                                source_collection,
                                dest_collection,
                                stats)

            # update our last timestamp; this is *not* guaranteed to be the timestamp of the
            # most recent op, which is impossible because of our out-of-order execution
            #
            # this is an approximation that needs to be accurate to within TS_REWIND seconds
            stats.last_ts = op["ts"]

        # while we have a tailable cursor, it can stop iteration if no more results come back
        # in a reasonable time, so sleep for a bit then try to continue iteration
        if cursor.alive:
            log.debug("replayed all oplog entries; sleeping...")
            stats.paused = True
            gevent.sleep(2)
            stats.paused = False
        else:
            log.error("cursor died on us!")
            break

    # just to silence pyflakes...
    stats_greenlet.kill()
    checkpoint_greenlet.kill()
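# The docstring above spells out the conservative replay strategy: inserts and deletes
# are applied directly, while updates are handled by re-reading the current document
# from the source and upserting it into the destination. _apply_op_worker itself is not
# shown in this section; a sketch of that logic, assuming pymongo 3 style collection
# methods and standard oplog entries ({'op': 'i'|'u'|'d', 'o': ..., 'o2': ...}), could be:
def _apply_op_worker_sketch(op, source_collection, dest_collection, stats):
    _id = _op_id(op)
    try:
        if op['op'] == 'i':
            # insert: upsert the document so that a replayed insert is harmless
            dest_collection.replace_one({'_id': _id}, op['o'], upsert=True)
        elif op['op'] == 'd':
            # delete: remove the document at the destination
            dest_collection.delete_one({'_id': _id})
        elif op['op'] == 'u':
            # update: don't interpret the update spec at all; fetch the current
            # document from the source and upsert it wholesale
            doc = source_collection.find_one({'_id': _id})
            if doc is not None:
                dest_collection.replace_one({'_id': _id}, doc, upsert=True)
    finally:
        # unblock apply_oplog(), which spins while this _id is in the pending set
        stats.pending_ids.discard(_id)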