def copy_collection(source, dest, state_path, percent): """ Copies all documents from source to destination collection. Inserts documents in batches using insert workers, which are each run in their own greenlet. Ensures that the destination is empty before starting the copy. Does no safety checks -- this is up to the caller. @param source dict of (host, port, db, collection) for the source @param dest dict of (host, port, db, collection) for the destination @param state_path path of state database @param percent percentage of documents to copy """ gevent.monkey.patch_socket() # open state database state_db = CopyStateDB(state_path) # connect to mongo source_client = utils.mongo_connect(source, ensure_direct=True, max_pool_size=30, read_preference=ReadPreference.SECONDARY, document_class=FasterOrderedDict) source_collection = source_client[source['db']][source['collection']] if source_client.is_mongos: raise Exception("for performance reasons, sources must be mongod instances; %s:%d is not", source['host'], source['port']) dest_client = utils.mongo_connect(dest, max_pool_size=30, document_class=FasterOrderedDict) dest_collection = dest_client[dest['db']][dest['collection']] # record timestamp of last oplog entry, so that we know where to start applying ops # later oplog_ts = utils.get_last_oplog_entry(source_client)['ts'] state_db.update_oplog_ts(source, dest, oplog_ts) # for testing copying of indices quickly if percent == 0: log.info("skipping copy because of --percent 0 parameters") state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES) return stats = Stats() stats.total_docs = int(source_collection.count()) if percent: # hack-ish but good enough for a testing-only feature stats.total_docs = int(stats.total_docs * (float(percent)/100.0)) # get all _ids, which works around a mongo bug/feature that causes massive slowdowns # of long-running, large reads over time ids = [] cursor = source_collection.find( projection={'_id':True}, modifiers={'$snapshot':True} ) cursor.batch_size(5000) insert_pool = Pool(INSERT_POOL_SIZE) stats_greenlet = gevent.spawn(_copy_stats_worker, stats) for doc in cursor: _id = doc['_id'] if percent is not None and not utils.id_in_subset(_id, percent): continue # when we've gathered enough _ids, spawn a worker greenlet to batch copy the # documents corresponding to them ids.append(_id) if len(ids) % INSERT_SIZE == 0: outgoing_ids = ids ids = [] insert_pool.spawn(_find_and_insert_batch_worker, source_collection=source_collection, dest_collection=dest_collection, ids=outgoing_ids, stats=stats) gevent.sleep() # insert last batch of documents if len(ids) > 0: _find_and_insert_batch_worker(source_collection=source_collection, dest_collection=dest_collection, ids=ids, stats=stats) stats.log() # wait until all other outstanding inserts have finished insert_pool.join() stats_greenlet.kill() log.info("done with initial copy") state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
def copy_collection(source, dest, state_path, percent): """ Copies all documents from source to destination collection. Inserts documents in batches using insert workers, which are each run in their own greenlet. Ensures that the destination is empty before starting the copy. Does no safety checks -- this is up to the caller. @param source dict of (host, port, db, collection) for the source @param dest dict of (host, port, db, collection) for the destination @param state_path path of state database @param percent percentage of documents to copy """ gevent.monkey.patch_socket() # open state database state_db = CopyStateDB(state_path) # connect to mongo source_client = utils.mongo_connect( source['host'], source['port'], ensure_direct=True, max_pool_size=30, read_preference=ReadPreference.SECONDARY, document_class=FasterOrderedDict) source_collection = source_client[source['db']][source['collection']] if source_client.is_mongos: raise Exception( "for performance reasons, sources must be mongod instances; %s:%d is not", source['host'], source['port']) dest_client = utils.mongo_connect(dest['host'], dest['port'], max_pool_size=30, document_class=FasterOrderedDict) dest_collection = dest_client[dest['db']][dest['collection']] # record timestamp of last oplog entry, so that we know where to start applying ops # later oplog_ts = utils.get_last_oplog_entry(source_client)['ts'] state_db.update_oplog_ts(source, dest, oplog_ts) # for testing copying of indices quickly if percent == 0: log.info("skipping copy because of --percent 0 parameters") state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES) return stats = Stats() stats.total_docs = int(source_collection.count()) if percent: # hack-ish but good enough for a testing-only feature stats.total_docs = int(stats.total_docs * (float(percent) / 100.0)) # get all _ids, which works around a mongo bug/feature that causes massive slowdowns # of long-running, large reads over time ids = [] cursor = source_collection.find(fields=["_id"], snapshot=True, timeout=False) cursor.batch_size(5000) insert_pool = Pool(INSERT_POOL_SIZE) stats_greenlet = gevent.spawn(_copy_stats_worker, stats) for doc in cursor: _id = doc['_id'] if percent is not None and not utils.id_in_subset(_id, percent): continue # when we've gathered enough _ids, spawn a worker greenlet to batch copy the # documents corresponding to them ids.append(_id) if len(ids) % INSERT_SIZE == 0: outgoing_ids = ids ids = [] insert_pool.spawn(_find_and_insert_batch_worker, source_collection=source_collection, dest_collection=dest_collection, ids=outgoing_ids, stats=stats) gevent.sleep() # insert last batch of documents if len(ids) > 0: _find_and_insert_batch_worker(source_collection=source_collection, dest_collection=dest_collection, ids=ids, stats=stats) stats.log() # wait until all other outstanding inserts have finished insert_pool.join() stats_greenlet.kill() log.info("done with initial copy") state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
def copy_collection(manifest, state_path, percent): """ Copies all documents from source to destination collection. Inserts documents in batches using insert workers, which are each run in their own greenlet. Does no safety checks -- this is up to the caller. @param manifest dict of (srchost, srcport, srcuser, srcpwd, srcdb, srccol, desthost, destport, destuser, destpwd, destdb, destcol) @param state_path path of state database @param percent percentage of documents to copy """ gevent.monkey.patch_socket() # open state database state_db = CopyStateDB(state_path) # connect to mongo source_client = utils.mongo_connect( manifest['srchost'], manifest['srcport'], manifest['srcuser'], manifest['srcpwd'], maxPoolSize=30, read_preference=ReadPreference.SECONDARY, document_class=FasterOrderedDict) source_collection = source_client[manifest['srcdb']][manifest['srccol']] if source_client.is_mongos: raise Exception( "for performance reasons, sources must be mongod instances; %s:%d is not", manifest['srchost'], source['srcport']) dest_client = utils.mongo_connect(manifest['desthost'], manifest['destport'], manifest['destuser'], manifest['destpwd'], maxPoolSize=30, document_class=FasterOrderedDict) dest_collection = dest_client[manifest['destdb']][manifest['destcol']] # for testing copying of indices quickly if percent == 0: log.info("skipping copy because of --percent 0 parameters") state_db.update_state(manifest, CopyStateDB.STATE_APPLYING_OPLOG) return stats = Stats() stats.total_docs = int(source_collection.count(filter=manifest["query"])) if percent: # hack-ish but good enough for a testing-only feature stats.total_docs = int(stats.total_docs * (float(percent) / 100.0)) # get all _ids, which works around a mongo bug/feature that causes massive slowdowns # of long-running, large reads over time ids = [] cursor = source_collection.find(filter=manifest["query"], projection={"_id": True}, no_cursor_timeout=False) cursor.batch_size(5000) insert_pool = Pool(INSERT_POOL_SIZE) stats_greenlet = gevent.spawn(_copy_stats_worker, stats) for doc in cursor: _id = doc['_id'] if percent is not None and not utils.id_in_subset(_id, percent): continue # when we've gathered enough _ids, spawn a worker greenlet to batch copy the # documents corresponding to them ids.append(_id) if len(ids) % INSERT_SIZE == 0: outgoing_ids = ids ids = [] insert_pool.spawn(_find_and_insert_batch_worker, source_collection=source_collection, dest_collection=dest_collection, ids=outgoing_ids, stats=stats) gevent.sleep() # insert last batch of documents if len(ids) > 0: _find_and_insert_batch_worker(source_collection=source_collection, dest_collection=dest_collection, ids=ids, stats=stats) stats.log() # wait until all other outstanding inserts have finished insert_pool.join() stats_greenlet.kill() srccount = stats.total_docs destcount = dest_collection.count(filter=manifest["query"]) if srccount == destcount: log.info("COPY SUCCEED. srccount(%d) == destcount(%d)" % (srccount, destcount)) else: log.error("COPY FAILED. srccount(%d) != destcount(%d)" % (srccount, destcount)) state_db.update_state(manifest, CopyStateDB.STATE_APPLYING_OPLOG)