def __init__(self, configuration):
    """Keep the configuration and open the two MongoDB handles.

    The database and collection names are read from *configuration*
    (``internal_database()`` / ``internal_test_write_collection()``).
    """
    self.configuration = configuration

    # Namespace used by this object, as reported by the configuration.
    self.db = configuration.internal_database()
    self.coll = configuration.internal_test_write_collection()

    # One handle built with is_primary=True, one with is_primary=False.
    primary_handle = Mongo(configuration, is_primary=True)
    self.primary = primary_handle
    secondary_handle = Mongo(configuration, is_primary=False)
    self.secondary = secondary_handle
def __init__(self, configuration, db, coll):
    """Remember the namespace and fetch the collection statistics.

    :param configuration: configuration object forwarded to ``Mongo``.
    :param db: database name.
    :param coll: collection name.
    """
    self.configuration = configuration
    self.db, self.coll = db, coll

    # Handles are created in the same order as before: primary, then secondary.
    self.mongo_primary = Mongo(configuration, is_primary=True)
    self.mongo_secondary = Mongo(configuration, is_primary=False)

    # Statistics are read once, from the handle built with is_primary=True.
    self.coll_stats = self.mongo_primary.collection_stats(db=db, coll=coll)
    self.previous_id = None
def __init__(self, configuration):
    """Read the write-test settings and pre-compute the string seed.

    Every setting comes from *configuration*; the seed is 50 times the
    document size, with a floor of 50 * 1024 characters.
    """
    self.configuration = configuration

    # Target namespace and sizing parameters.
    self.db = configuration.internal_database()
    self.coll = configuration.internal_test_write_collection()
    self.coll_expected_size = configuration.internal_test_write_size()
    self.document_size = configuration.internal_test_write_document_size()

    # Generated once here and stored for later use.
    seed_length = 50 * int(max(1024, self.document_size))
    self.string_seed = TestWrite.generate_string_seed(seed_length)

    # Handle built with is_primary=False.
    self.mongo = Mongo(configuration, is_primary=False)
class TestRead:
    """Read benchmark over the internal test-write collection."""

    def __init__(self, configuration):
        """Read the namespace from *configuration* and open both handles."""
        self.configuration = configuration
        self.db = self.configuration.internal_database()
        self.coll = self.configuration.internal_test_write_collection()
        self.primary = Mongo(self.configuration, is_primary=True)
        self.secondary = Mongo(self.configuration, is_primary=False)

    def start(self):
        """Iterate over every document of the collection and print the rate.

        The documents are fetched through ``self.primary`` sorted by ``_id``
        in ascending order; they are only counted, never inspected.
        """
        print('Reading data from the mongosync database.')
        st = time.time()
        cursor = self.primary.find(db=self.db, coll=self.coll, query={}, sort_field='_id', sort_order=pymongo.ASCENDING)

        n = 0
        for _ in cursor:
            n += 1

        dt = time.time() - st
        # The clock delta can be exactly 0 (empty collection, coarse timer):
        # report 0 docs/s instead of failing with ZeroDivisionError.
        docs_per_second = int(n / dt) if dt > 0 else 0
        print('Read ' + str(n) + ' documents in ' + str(int(dt)) + 's (' + str(docs_per_second) + ' docs/s).')
def __init__(self, configuration, db, coll, seed_start=None, seed_end=None, total_seeds=1):
    """Store the range to copy and open the MongoDB handles.

    :param configuration: configuration object forwarded to ``Mongo``.
    :param db: database name.
    :param coll: collection name.
    :param seed_start: lower bound of the range, or None.
    :param seed_end: upper bound of the range, or None.
    :param total_seeds: total number of seeds, which can be seen as the
        number of instances of CollectionPart.
    """
    self.configuration = configuration
    self.db = db
    self.coll = coll

    # As soon as one bound is missing, both are replaced by a placeholder
    # so the rest of the code can always read bound['_id'].
    both_bounds_given = seed_start is not None and seed_end is not None
    self.seed_start = seed_start if both_bounds_given else {'_id': None}
    self.seed_end = seed_end if both_bounds_given else {'_id': None}
    self.total_seeds = total_seeds

    self.mongo_primary = Mongo(configuration, is_primary=True)
    self.mongo_secondary = Mongo(configuration, is_primary=False)
    self.coll_stats = self.mongo_primary.collection_stats(db=db, coll=coll)
    self.previous_id = None
class TestWrite:
    """Write benchmark: fill the internal test collection up to a target size."""

    def __init__(self, configuration):
        """Read the write-test settings and pre-compute the string seed."""
        self.configuration = configuration
        self.db = self.configuration.internal_database()
        self.coll = self.configuration.internal_test_write_collection()
        self.coll_expected_size = self.configuration.internal_test_write_size()
        self.document_size = self.configuration.internal_test_write_document_size()
        self.string_seed = TestWrite.generate_string_seed(50 * int(max(1024, self.document_size)))
        self.mongo = Mongo(self.configuration, is_primary=False)

    def start(self):
        """Load GBs of data as fast as possible, to test the mongosync speed afterwards.

        The collection is dropped first, then filled batch by batch until
        its reported ``storageSize`` reaches ``coll_expected_size`` GB. The
        size is only refreshed every 10 batches.
        """
        print('Inserting data in the database, we want to go up to '+str(self.coll_expected_size)+'GB.')
        self.mongo.drop(self.db, self.coll)

        # To avoid an insane number of docs inserted at once, we estimate how
        # many fit in 16 MB. At least one document is always sent: a document
        # size above 16 MB used to give 0 here, i.e. empty batches.
        docs_per_insert = max(1, int(16 * 1024 * 1024 / self.document_size))
        expected_bytes = self.coll_expected_size * (1024 ** 3)

        current_size = 0
        n = 0
        batches = 0
        st = time.time()
        dt = 0
        while current_size < expected_bytes:
            # "_" on purpose: the loop variable must never clash with the
            # batch counter.
            docs = [{"stuff": "hello", "raw": self.random_string_from_seed(self.document_size)} for _ in range(docs_per_insert)]
            self.mongo.insert_many(self.db, self.coll, docs)
            n += len(docs)
            batches += 1

            if batches % 10 == 0:
                raw_stats = self.mongo.collection_stats(self.db, self.coll)
                current_size = raw_stats.get('storageSize', 0)
                if current_size == 0:
                    print("Warning: We got 0 bytes as storage size for the TestWrite. Did you delete the collection during the process? We'll handle it anyway.")
                dt = time.time() - st
                print(self._progress_line(n, dt, current_size))

        print(self._progress_line(n, dt, current_size))
        print('The end!')

    def _progress_line(self, inserted, elapsed, current_size):
        """Build the progress message shared by the periodic and final logs."""
        return ('Inserted ' + str(inserted) + ' documents in ' + str(int(elapsed))
                + 's. Current size is ' + str(int(current_size / (1024 ** 3)))
                + '/' + str(self.coll_expected_size) + 'GB.')

    @staticmethod
    def generate_string_seed(size):
        """Return a random alphanumeric string of *size* characters.

        To avoid the compression of MongoDB to mess with our test, we create
        a random string used as seed for all documents. This is a "slow"
        process so we cannot generate a new random string for each document.
        """
        print('Generate random string seed, it might take some time...')
        letters = 'azertyuiopqsdfghjklmwxcvbnAZERTYUIOPQSDFGHJKLMWXCVBN0123456789'
        return ''.join([random.choice(letters) for _ in range(size)])

    def random_string_from_seed(self, size):
        """Return *size* consecutive characters taken from the seed string.

        Only the start offset is random. ``random.randint`` raises
        ValueError when *size* is larger than the seed.
        """
        # In fact, really not random, but whatever, we'll hope MongoDB will
        # not be able to compress the data too efficiently.
        start = random.randint(0, len(self.string_seed) - size)
        return self.string_seed[start:start + size]
class Collection:
    """Prepare one collection for synchronisation and split it into ranges."""

    def __init__(self, configuration, db, coll):
        """Store the namespace, open both handles and read the source stats."""
        self.configuration = configuration
        self.db = db
        self.coll = coll
        self.mongo_primary = Mongo(configuration, is_primary=True)
        self.mongo_secondary = Mongo(configuration, is_primary=False)
        self.coll_stats = self.mongo_primary.collection_stats(db=self.db, coll=self.coll)
        self.previous_id = None

    def prepare_sync(self):
        """Prepare the collection to synchronize and return the parts to copy.

        :return: one dict per range, holding the keyword arguments ``db``,
            ``coll``, ``seed_start``, ``seed_end`` and ``total_seeds``.
        :raises ValueError: when fewer than two seeds are available, since
            no range can be built from them.
        """
        # Drop / Create the destination collection
        self.check_collection()

        # Add indexes
        self.copy_indexes()

        # Get the various seeds. Two consecutive seeds delimit one part, so
        # anything below two seeds would silently produce no part at all.
        seeds = self.list_seeds()
        if len(seeds) < 2:
            print("We should always have at least 2 seeds.")
            raise ValueError("Invalid seed number. Failure.")

        # NOTE(review): 'total_seeds' is the number of seeds, i.e. one more
        # than the number of parts returned here; confirm this is intended.
        return [
            {
                'db': self.db,
                'coll': self.coll,
                'seed_start': seed_start,
                'seed_end': seed_end,
                'total_seeds': len(seeds),
            }
            for seed_start, seed_end in zip(seeds, seeds[1:])
        ]

    def list_seeds(self):
        """Load the seeds we want to use for the synchronisation.

        :return: a flat, ordered list of seeds; each pair of consecutive
            entries is one range. ``[None, None]`` means a single range
            without bounds.
        """
        # First, we need to be sure that we have an _id and if that's an
        # objectid, otherwise we cannot use the same technique. The oplog
        # should only be tailed by one thread at a time, so we want to be
        # sure to never create seeds for it.
        id_type = self.mongo_primary.id_type(self.db, self.coll)
        is_oplog = self.db == "local" and self.coll == "oplog.rs"
        if id_type['has_id'] is False or id_type['is_object_id'] is False or is_oplog:
            return [None, None]

        first_seed = {'_id': ObjectId('0' * 24)}
        last_seed = {'_id': ObjectId('f' * 24)}

        # Number of seeds we would like
        quantity = self.configuration.internal_maximum_seeds()
        if self.coll_stats['count'] <= 100 * quantity:
            # Arbitrarily, we decide it's useless to use a lot of seeds if we
            # only have a small number of documents. The two bounds are
            # returned as two seeds, like every other branch: wrapping them
            # in a single tuple made prepare_sync() see only one seed and
            # build no part, so the collection was never copied.
            return [first_seed, last_seed]

        # Get various seeds
        seeds = self.mongo_primary.section_ids(self.db, self.coll, quantity=quantity)

        # TODO: In the future, if we want to be smart and allow retry of a
        # failed sync, we should take the previous seeds stored in the
        # mongosync database, then make a simple query to see up to where
        # they went.

        # We order them to be able to return ranges; the ObjectId already
        # allows to compare values.
        seeds = sorted(seeds, key=lambda seed: seed['_id'])

        # Always add the first and last seed
        return [first_seed] + seeds + [last_seed]

    def check_collection(self):
        """Run the checks needed before writing to the destination collection.

        Creates the destination as a capped collection when the source is
        capped and the destination does not exist yet.
        """
        # Stats about the optional collection
        if self.db in self.mongo_secondary.list_databases() and self.coll in self.mongo_secondary.list_collections(self.db):
            destination_stats = self.mongo_secondary.collection_stats(db=self.db, coll=self.coll)
        else:
            destination_stats = {}

        # For internal db, we want to remove them by default, to avoid any
        # problem (one exception: the oplog).
        if len(destination_stats) != 1 and self.db == 'local':
            # Deliberately disabled with "and False": not possible to drop
            # every db.
            if self.coll != 'oplog.rs' and False:
                self.mongo_secondary.drop(self.db, self.coll)
                # Reset the local stats (not an instance attribute) so the
                # capped check below would see the collection as missing.
                destination_stats = {}

        # Optionally create a capped collection, but we only do that if it
        # didn't exist before.
        if self.coll_stats['capped'] is True and len(destination_stats) == 0:
            capped_max_size = self.coll_stats.get('maxSize', -1)
            capped_max = self.coll_stats.get('max', -1)
            if self.coll_stats['ns'] == 'local.oplog.rs':
                # Special case, we do not necessarily want to keep the same
                # oplog size as the other node.
                capped_max_size = self.configuration.mongo_oplog_size() * (1024 ** 3)

            # -1 stands for "no limit reported" and is forwarded as None.
            if capped_max_size == -1:
                capped_max_size = None
            if capped_max == -1:
                capped_max = None
            self.mongo_secondary.create_collection(self.db, self.coll, capped=True, max=capped_max, max_size=capped_max_size)

    def copy_indexes(self):
        """Create the source indexes on the destination before copying data.

        It is better to copy indexes directly, before copying the data. That
        way we directly have the TTL working, but we also do not need to read
        all data at the end to create the indexes (if Mongo is on a NFS
        mount, you will lose a lot of time just for that). Even if the insert
        performance will be "worst", at the end, it should not matter a lot
        with the multi-threading copy of mongosync.
        """
        expected_indexes = self.mongo_primary.get_indexes(self.db, self.coll)
        for name, index in expected_indexes.items():
            # Build a fresh dict instead of editing the one handed back by
            # get_indexes(): 'key' is renamed to 'keys' and the name added.
            options = {field: value for field, value in index.items() if field != 'key'}
            options['keys'] = index['key']
            options['name'] = name
            self.mongo_secondary.create_index(self.db, self.coll, options)

    def __str__(self):
        return 'Collection:' + self.db + '.' + self.coll

    def __repr__(self):
        return self.__str__()
class CollectionPart:
    """Copy the documents of one collection located between two seeds.

    Seeds can be None if there is no {"_id": ObjectId()} in the document
    database, in that case there will be only one thread in charge of
    copying the database. Subclasses must implement ``continue_fetching``
    and ``sync_section``.
    """

    def __init__(self, configuration, db, coll, seed_start=None, seed_end=None, total_seeds=1):
        """Store the range to copy, open both handles and read the stats."""
        self.configuration = configuration
        self.db = db
        self.coll = coll

        if seed_start is None or seed_end is None:
            # It eases the code below
            seed_start = {'_id': None}
            seed_end = {'_id': None}

        self.seed_start = seed_start
        self.seed_end = seed_end
        # Total number of seeds, which can be seen as the number of
        # instances of CollectionPart.
        self.total_seeds = total_seeds
        self.mongo_primary = Mongo(configuration, is_primary=True)
        self.mongo_secondary = Mongo(configuration, is_primary=False)
        self.coll_stats = self.mongo_primary.collection_stats(db=self.db, coll=self.coll)
        self.previous_id = None

    def continue_fetching(self, received_quantity, expected_quantity):
        """Indicate if we should continue pulling data from the collection.

        :raises ValueError: always; to implement in the children.
        """
        raise ValueError('To implement in the children.')

    def insert_subset(self, documents):
        """Insert a bunch of documents, falling back to one insert per document.

        The fallback avoids crashes if the total size is bigger than 16MB.
        An error raised during the fallback is not caught.
        """
        try:
            self.mongo_secondary.insert_many(self.db, self.coll, documents)
        except Exception as e:
            print('Exception while trying to insert ' + str(len(documents)) + ' documents in ' + str(self) + ' (' + str(e) + '). Try once again, but one document after another.')
            # Maybe we exceeded the 16MB, so better insert every document
            # separately.
            for doc in documents:
                self.mongo_secondary.insert_many(self.db, self.coll, [doc])

    def _expected_documents(self):
        """Estimate how many documents this part holds (at least 1)."""
        return int(max(1, self.coll_stats['count'] / self.total_seeds))

    def sync(self):
        """Sync the entire part of the collection assigned to this instance.

        Every document between the two given seeds is copied. The collection
        must be initially created by the Collection class, this is not the
        job of this class.

        :return: dict with the cumulated ``quantity``, ``read_time`` and
            ``write_time`` reported by ``sync_section``.
        """
        # A missing or zero average size must not crash the limit computation.
        average_object_size = max(1, self.coll_stats.get('avgObjSize') or 0)
        # It can increase but it's not a problem, it's only used for logging.
        expected_documents = self._expected_documents()

        # Write limit is 16MB, so we put a security factor by only using
        # ~12 MB. Never go below one document: objects above 12 MB used to
        # give a limit of 0.
        limit_write = max(1, int(12 * (1024 ** 2) / average_object_size))

        # For the read-limit, we can arbitrarily take up to 16 MB * 10, to
        # avoid using too much RAM.
        limit_read = int(limit_write * 10)

        # Raw estimation of the data size for the current collection part
        storage_size_part = self.coll_stats['storageSize'] / ((1024 ** 3) * self.total_seeds)

        objects_in_it = True
        offset = 0
        st = time.time()
        read_time = 0
        write_time = 0
        print(str(self) + ' (start-sync): ~' + str(expected_documents) + ' docs, ~' + str(int(storage_size_part)) + 'GB.')
        while objects_in_it:
            raw_stats = self.sync_section(offset, limit_read, limit_write)
            offset += raw_stats['quantity']
            read_time += raw_stats['read_time']
            write_time += raw_stats['write_time']
            objects_in_it = self.continue_fetching(raw_stats['quantity'], limit_read)

            # Progress is logged after every batch.
            if offset >= expected_documents:
                # To have better logs, we check the remaining entries
                self.coll_stats = self.mongo_primary.collection_stats(db=self.db, coll=self.coll)
                expected_documents = self._expected_documents()

            ratio = int(1000 * offset / expected_documents) / 10  # To have the format 100.0%
            dt = time.time() - st
            average_speed = 1
            expected_remaining_time = 0
            read_share = 0
            write_share = 0
            # dt can be exactly 0 on a very fast first batch: every division
            # by it is guarded.
            if dt > 0:
                if offset > 0:
                    average_speed = offset / dt
                    expected_remaining_time = int((expected_documents - offset) / (average_speed * 60))  # In minutes
                read_share = int(100 * read_time / dt)
                write_share = int(100 * write_time / dt)

            time_log = 'Read time: ' + str(read_share) + '%, write time: ' + str(write_share) + '%'
            print(str(self) + ' (syncing): ' + str(offset) + '/' + str(expected_documents) + ' docs (' + str(ratio) + '%, ' + str(int(average_speed)) + ' docs/s). Remaining time: ~' + str(expected_remaining_time) + ' minutes. ' + time_log)

        dt = time.time() - st
        print(str(self) + ' (end-sync): ' + str(offset) + ' docs, ' + str(int(storage_size_part)) + 'GB. Time spent: ' + str(int(dt)) + 's.')

        # We return some stats
        return {
            'quantity': offset,
            'read_time': read_time,
            'write_time': write_time
        }

    def sync_section(self, offset, limit_read, limit_write):
        """Sync one section of this part (between the two given seeds).

        Implementations return the stats of the section; ``sync`` reads the
        keys ``quantity``, ``read_time`` and ``write_time``. We are not using
        any iterator in this case, so the method should normally not crash
        if the connexion is lost at the wrong time.

        :raises ValueError: always; to override in the children.
        """
        raise ValueError('Not implemented, to override')

    def __str__(self):
        return 'CollectionPart:' + self.db + '.' + self.coll + ':[' + str(self.seed_start['_id']) + ';' + str(self.seed_end['_id']) + ']'

    def __repr__(self):
        return self.__str__()
def __init__(self, configuration):
    """Keep the configuration and open one handle per ``is_primary`` value."""
    self.configuration = configuration

    # Same construction order as before: is_primary=True first.
    primary_handle = Mongo(configuration, is_primary=True)
    self.primary = primary_handle
    secondary_handle = Mongo(configuration, is_primary=False)
    self.secondary = secondary_handle
class Core:
    """Entry point driving the synchronisation of every database."""

    def __init__(self, configuration):
        """Keep the configuration and open one handle per ``is_primary`` value."""
        self.configuration = configuration
        self.primary = Mongo(configuration, is_primary=True)
        self.secondary = Mongo(configuration, is_primary=False)

    def start(self):
        """Launch the entire synchronisation of every database.

        Every collection is split into parts, the parts are queued, and
        worker processes running ``clone_collection_part`` consume them. The
        method returns once every worker but one has reported 'DONE'.

        :raises ValueError: when no part was produced for local.oplog.rs.
        """
        print('Prepare sync of the following databases: ' + str(', '.join(self.primary.list_databases())))

        # Check all CollectionParts we need to create
        oplog_input = None
        other_inputs = []
        for db in self.primary.list_databases():
            for coll in self.primary.list_collections(db):
                targets_oplog = db == "local" and coll == "oplog.rs"
                collection = Collection(configuration=self.configuration, db=db, coll=coll)
                for part_inputs in collection.prepare_sync():
                    # We need to reserve a long-running thread for the oplog.
                    # So, we want to put it as the first element of the Queue.
                    payload = {'collection_part': part_inputs}
                    if targets_oplog:
                        oplog_input = payload
                    else:
                        other_inputs.append(payload)

        if oplog_input is None:
            raise ValueError("No oplog found...")

        # Fill queues used for the multi-threading, oplog first.
        qi = mp.Queue()
        qo = mp.Queue()
        qi.put(oplog_input)
        for payload in other_inputs:
            qi.put(payload)

        # Starts the Jobs. We need at least 1 thread for the oplog, and
        # another for the other collections.
        jobs = []
        jobs_quantity = 1 + int(max(1, self.configuration.internal_threads()))
        common_info = {'configuration_filepath': Configuration.FILEPATH}
        for i in range(int(jobs_quantity)):
            # One 'DONE' marker is queued for every job started.
            qi.put('DONE')
            job = mp.Process(target=clone_collection_part, args=(qi, qo, i, common_info,))
            job.start()
            jobs.append(job)

        # There is one long-running thread which should never finish by
        # itself, hence the "- 1".
        job_done = 0
        while job_done < (jobs_quantity - 1):
            try:
                res = qo.get(timeout=3600 * 24)
            except QueueEmpty:
                # We cannot put a super-huge time out, so we simply handle
                # the exception; any other error propagates.
                continue
            if res == 'DONE':
                job_done += 1
                print('Remaining jobs: ' + str(jobs_quantity - job_done - 1))

        print('End synchronisation of every database, the oplog synchronisation will continue until you stop this script. Afterwards, just remove the database from the maintenance mode.')

    @staticmethod
    def create_collection_part(inputs):
        """Create the appropriate CollectionPart instance.

        ``inputs`` is expanded as keyword arguments; local.oplog.rs gets an
        OplogCollectionPart, anything else a BasicCollectionPart.
        """
        targets_oplog = inputs['db'] == 'local' and inputs['coll'] == 'oplog.rs'
        part_class = OplogCollectionPart if targets_oplog else BasicCollectionPart
        return part_class(**inputs)