Example #1
    def __init__(self, configuration):
        self.configuration = configuration
        self.db = self.configuration.internal_database()
        self.coll = self.configuration.internal_test_write_collection()

        self.primary = Mongo(self.configuration, is_primary=True)
        self.secondary = Mongo(self.configuration, is_primary=False)
Example #2
    def __init__(self, configuration, db, coll):
        self.configuration = configuration
        self.db = db
        self.coll = coll
        self.mongo_primary = Mongo(configuration, is_primary=True)
        self.mongo_secondary = Mongo(configuration, is_primary=False)

        self.coll_stats = self.mongo_primary.collection_stats(db=self.db, coll=self.coll)
        self.previous_id = None
Example #3
    def __init__(self, configuration):
        self.configuration = configuration
        self.db = self.configuration.internal_database()
        self.coll = self.configuration.internal_test_write_collection()

        self.coll_expected_size = self.configuration.internal_test_write_size()
        self.document_size = self.configuration.internal_test_write_document_size()

        self.string_seed = TestWrite.generate_string_seed(50*int(max(1024,self.document_size)))

        self.mongo = Mongo(self.configuration, is_primary=False)
Example #4
class TestRead:
    def __init__(self, configuration):
        self.configuration = configuration
        self.db = self.configuration.internal_database()
        self.coll = self.configuration.internal_test_write_collection()

        self.primary = Mongo(self.configuration, is_primary=True)
        self.secondary = Mongo(self.configuration, is_primary=False)

    """
        Small method to loads GBs of data as fast as possible in a mongodb instance, to test the mongosync speed afterwards
    """

    def start(self):
        print('Reading data from the mongosync database.')
        st = time.time()
        n = 0
        cursor = self.primary.find(db=self.db,
                                   coll=self.coll,
                                   query={},
                                   sort_field='_id',
                                   sort_order=pymongo.ASCENDING)
        for doc in cursor:
            n += 1
        dt = time.time() - st
        print('Read ' + str(n) + ' documents in ' + str(int(dt)) + 's (' +
              str(int(n / dt)) + ' docs/s).')
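
For context, a minimal driver for these test helpers might look like the sketch below. It only relies on calls already shown in these examples; how a Configuration instance is actually built is not shown here, so that constructor call is an assumption.

# Hedged sketch: how TestWrite / TestRead could be driven end to end.
# The Configuration() constructor call is hypothetical; only the methods
# used in the examples above are assumed to exist.
if __name__ == '__main__':
    configuration = Configuration()   # hypothetical: real construction not shown in these examples
    TestWrite(configuration).start()  # fill the test collection with GBs of documents
    TestRead(configuration).start()   # then measure the raw read throughput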
Example #5
    def __init__(self,
                 configuration,
                 db,
                 coll,
                 seed_start=None,
                 seed_end=None,
                 total_seeds=1):
        self.configuration = configuration
        self.db = db
        self.coll = coll
        if seed_start is None or seed_end is None:  # It eases the code below
            seed_start = {'_id': None}
            seed_end = {'_id': None}
        self.seed_start = seed_start
        self.seed_end = seed_end
        self.total_seeds = total_seeds  # Total number of seeds, which can be seen as the number of instances of CollectionPart
        self.mongo_primary = Mongo(configuration, is_primary=True)
        self.mongo_secondary = Mongo(configuration, is_primary=False)

        self.coll_stats = self.mongo_primary.collection_stats(db=self.db,
                                                              coll=self.coll)
        self.previous_id = None
Example #6
class TestWrite:

    def __init__(self, configuration):
        self.configuration = configuration
        self.db = self.configuration.internal_database()
        self.coll = self.configuration.internal_test_write_collection()

        self.coll_expected_size = self.configuration.internal_test_write_size()
        self.document_size = self.configuration.internal_test_write_document_size()

        self.string_seed = TestWrite.generate_string_seed(50*int(max(1024,self.document_size)))

        self.mongo = Mongo(self.configuration, is_primary=False)

    """
        Small method to loads GBs of data as fast as possible in a mongodb instance, to test the mongosync speed afterwards
    """
    def start(self):
        print('Inserting data in the database, we want to go up to '+str(self.coll_expected_size)+'GB.')
        self.mongo.drop(self.db, self.coll)
        current_size = 0

        # To avoid an insane number of docs inserted at once, we estimate how many we can send in a single batch
        docs_per_insert = int(16 * 1024 * 1024 / self.document_size)
        n = 0
        i = 0
        st = time.time()
        dt = 0
        while current_size < self.coll_expected_size * (1024 ** 3):
            docs = [{"stuff":"hello","raw":self.random_string_from_seed(self.document_size)} for i in range(docs_per_insert)]
            self.mongo.insert_many(self.db, self.coll, docs)
            n += len(docs)
            i += 1

            if i % 10 == 0:
                raw_stats = self.mongo.collection_stats(self.db, self.coll)
                current_size = raw_stats.get('storageSize', 0)
                if current_size == 0:
                    print("Warning: We got 0 bytes as storage size for the TestWrite. Did you delete the collection during the process? We'll handle it anyway.")

                dt = time.time() - st
                print('Inserted '+str(n)+' documents in '+str(int(dt))+'s. Current size is '+str(int(current_size/(1024 ** 3)))+'/'+str(self.coll_expected_size)+'GB.')

        print('Inserted ' + str(n) + ' documents in ' + str(int(dt)) + 's. Current size is ' + str(int(current_size / (1024 ** 3))) + '/' + str(self.coll_expected_size) + 'GB.')
        print('The end!')

    """
        To avoid the compression of MongoDB to mess with our test, we create a random string used as seed for all document.
        This is a "slow" process so we cannot generate a new random string for each document  
    """
    @staticmethod
    def generate_string_seed(size):
        print('Generate random string seed, it might take some time...')
        letters = 'azertyuiopqsdfghjklmwxcvbnAZERTYUIOPQSDFGHJKLMWXCVBN0123456789'
        return ''.join([random.choice(letters) for i in range(size)])

    """
        Return a "random" list of characters from the seed string 
    """
    def random_string_from_seed(self, size):
        # In fact, really not random, but whatever, we'll hope MongoDB will not be able to compress the data too efficiently
        start = random.randint(0,len(self.string_seed) - size)
        return self.string_seed[start:start+size]
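
The seed-string trick above can be illustrated on its own: one expensive random string is generated once, and every document payload is a cheap slice taken at a random offset. A minimal, self-contained sketch of that idea (the names below are illustrative, not part of the project):

import random
import string

# Standalone illustration of the seed-slicing idea used by TestWrite: the seed is
# generated once, then each "document" payload is a slice taken at a random offset.
def make_seed(size):
    return ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(size))

def slice_from_seed(seed, size):
    start = random.randint(0, len(seed) - size)
    return seed[start:start + size]

if __name__ == '__main__':
    document_size = 1024                  # bytes of payload per document
    seed = make_seed(50 * document_size)  # built once, reused for every document
    docs = [{'stuff': 'hello', 'raw': slice_from_seed(seed, document_size)} for _ in range(5)]
    print(str(len(docs)) + ' sample documents built from a single seed string')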
Example #7
class Collection:
    def __init__(self, configuration, db, coll):
        self.configuration = configuration
        self.db = db
        self.coll = coll
        self.mongo_primary = Mongo(configuration, is_primary=True)
        self.mongo_secondary = Mongo(configuration, is_primary=False)

        self.coll_stats = self.mongo_primary.collection_stats(db=self.db, coll=self.coll)
        self.previous_id = None

    """
        In charge of preparing the collection to synchronize, returns the various seeds we should use. 
    """
    def prepare_sync(self):
        average_object_size = self.coll_stats['avgObjSize']
        expected_documents  = self.coll_stats['count'] # It can increase but it's not a problem, it's only used for logging

        # Drop / Create the destination collection
        self.check_collection()

        # Add indexes
        self.copy_indexes()

        # Get the various seeds
        seeds = self.list_seeds()
        if len(seeds) < 2:
            print("We should always have at least 2 seeds.")
            raise ValueError("Invalid seed number. Failure.")

        # Create and return the list of inputs necessary to create CollectionPart
        collection_parts = []
        previous_seed = seeds[0]
        for seed in seeds[1:]:
            collection_parts.append({
                'db': self.db,
                'coll': self.coll,
                'seed_start': previous_seed,
                'seed_end': seed,
                'total_seeds': len(seeds)
            })
            previous_seed = seed
        return collection_parts



    """
        Load the seeds we want to use for the synchronisation. Return a list of tuples, each tuple represent a range,
        the first value is the start of it, the second value, the end of it.
    """
    def list_seeds(self):
        # First, we need to be sure that we have an _id and that it is an ObjectId; otherwise we cannot use the same technique.
        # The oplog should only be tailed by one thread at a time, so we want to be sure to never create seeds for it.
        id_type = self.mongo_primary.id_type(self.db, self.coll)
        if id_type['has_id'] is False or id_type['is_object_id'] is False or (self.db == "local" and self.coll == "oplog.rs"):
            return [None, None]


        # Number of seeds we would like
        quantity = self.configuration.internal_maximum_seeds()
        if self.coll_stats['count'] <= 100*quantity: # Arbitrarily, we decide it's useless to use a lot of seeds if we only have a small number of documents
            return [{'_id': ObjectId('0'*24)}, {'_id': ObjectId('f'*24)}]

        # Get various seeds
        seeds = self.mongo_primary.section_ids(self.db, self.coll, quantity=quantity)

        # TODO: In the future, if we want to be smart and allow retry of a failed sync, we should take the previous seeds
        # stored in the mongosync database, then make a simple query to see up to where they went

        # We order them to be able to return ranges. ObjectId values are directly comparable.
        seeds = sorted(seeds, key=lambda seed: seed['_id'])

        # Always add the first and last seed
        seeds = [{'_id':ObjectId('0'*24)}] + seeds
        seeds.append({'_id':ObjectId('f'*24)})

        return seeds

    """
        Specific checks before writing to a collection
    """
    def check_collection(self):
        # Stats about the destination collection, if it already exists
        if self.db in self.mongo_secondary.list_databases() and self.coll in self.mongo_secondary.list_collections(self.db):
            destination_stats = self.mongo_secondary.collection_stats(db=self.db, coll=self.coll)
        else:
            destination_stats = {}

        # For internal db, we want to remove them by default, to avoid any problem (one exception: the oplog)
        if len(destination_stats) != 1 and self.db == 'local':
            if self.coll != 'oplog.rs' and False:
                # Not possible to drop every db
                self.mongo_secondary.drop(self.db, self.coll)
                destination_stats = {}

        # Optionally create a capped collection, but we only do that if it didn't exist before
        if self.coll_stats['capped'] is True and len(destination_stats) == 0:
            capped_max_size = self.coll_stats.get('maxSize', -1)
            capped_max = self.coll_stats.get('max', -1)
            if self.coll_stats['ns'] == 'local.oplog.rs':
                # Special case, we do not necessarily want to keep the same oplog size as the other node
                capped_max_size = self.configuration.mongo_oplog_size() * (1024 ** 3)

            if capped_max_size == -1:
                capped_max_size = None
            if capped_max == -1:
                capped_max = None

            self.mongo_secondary.create_collection(self.db, self.coll, capped=True, max=capped_max, max_size=capped_max_size)

    """
        It is better to copy indexes directly, before copying the data. That way we directly have the TTL working, but we also
        do not need to read all data at the end to create the indexes (if Mongo is on a NFS mount, you will lose a lot of
        time just for that).
        Even if the insert performance will be "worst", at the end, it should not matter a lot with the multi-threading copy
        of mongosync 
    """
    def copy_indexes(self):
        expected_indexes = self.mongo_primary.get_indexes(self.db, self.coll)
        for name in expected_indexes:
            index = expected_indexes[name]
            options = index
            options['keys'] = index['key']
            options['name'] = name
            del index['key']

            self.mongo_secondary.create_index(self.db, self.coll, options)


    def __str__(self):
        return 'Collection:'+self.db+'.'+self.coll

    def __repr__(self):
        return self.__str__()
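
The pairing done in prepare_sync, where an ordered list of seed boundaries becomes contiguous ranges, can be shown in isolation. The sketch below mirrors that logic with plain integers standing in for ObjectId seeds; it is illustrative only.

# Standalone illustration of the seed pairing done in Collection.prepare_sync:
# N+1 sorted boundaries become N contiguous (start, end) ranges.
def seeds_to_ranges(seeds):
    ranges = []
    previous = seeds[0]
    for seed in seeds[1:]:
        ranges.append((previous, seed))
        previous = seed
    return ranges

if __name__ == '__main__':
    boundaries = [0, 250, 500, 750, 1000]  # stand-ins for sorted ObjectId seeds
    for start, end in seeds_to_ranges(boundaries):
        print('CollectionPart would cover [' + str(start) + '; ' + str(end) + ']')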
Example #8
class CollectionPart:
    """
        Seeds can be None if there is no {"_id": ObjectId()} in the document database, in that case there will be only one
        thread in charge of copying the database
    """
    def __init__(self,
                 configuration,
                 db,
                 coll,
                 seed_start=None,
                 seed_end=None,
                 total_seeds=1):
        self.configuration = configuration
        self.db = db
        self.coll = coll
        if seed_start is None or seed_end is None:  # It eases the code below
            seed_start = {'_id': None}
            seed_end = {'_id': None}
        self.seed_start = seed_start
        self.seed_end = seed_end
        self.total_seeds = total_seeds  # Total number of seeds, which can be seen as the number of instances of CollectionPart
        self.mongo_primary = Mongo(configuration, is_primary=True)
        self.mongo_secondary = Mongo(configuration, is_primary=False)

        self.coll_stats = self.mongo_primary.collection_stats(db=self.db,
                                                              coll=self.coll)
        self.previous_id = None

    """
        Indicates if we should continue pulling data from the collection or not. For a BasicCollectionPart this will be easy
    """

    def continue_fetching(self, received_quantity, expected_quantity):
        raise ValueError('To implement in the children.')

    """
        Try to insert a bunch of documents, while avoiding crashes if the total size is bigger than 16MB
    """

    def insert_subset(self, documents):
        try:
            self.mongo_secondary.insert_many(self.db, self.coll, documents)
        except Exception as e:
            print('Exception while trying to insert ' + str(len(documents)) +
                  ' documents in ' + str(self) + ' (' + str(e) +
                  '). Try once again, but one document after another.')
            # Maybe we exceeded the 16MB, so better insert every document separately
            for doc in documents:
                self.mongo_secondary.insert_many(self.db, self.coll, [doc])

    """
        In charge of syncing the entire part of the collection assigned to it, so every document between two given
        seeds. The collection must be initially created by the Collection class, this is not the job of this class.
    """

    def sync(self):
        average_object_size = self.coll_stats['avgObjSize']
        expected_documents = int(
            max(1, self.coll_stats['count'] / self.total_seeds)
        )  # It can increase but it's not a problem, it's only used for logging

        # Write limit is 16MB, so we put a security factor by only using ~12 MB
        limit_write = int(12 * (1024**2) / average_object_size)
        # For the read-limit, we can arbitrarily take up to 16 MB * 10, to avoid using too much RAM.
        limit_read = int(limit_write * 10)

        # Raw estimation of the data size for the current collection part
        storage_size_part = self.coll_stats['storageSize'] / (
            (1024**3) * self.total_seeds)

        objects_in_it = True
        offset = 0
        st = time.time()
        read_time = 0
        write_time = 0
        i = 0
        print(
            str(self) + ' (start-sync): ~' + str(expected_documents) +
            ' docs, ~' + str(int(storage_size_part)) + 'GB.')
        while objects_in_it:
            raw_stats = self.sync_section(offset, limit_read, limit_write)
            offset += raw_stats['quantity']
            read_time += raw_stats['read_time']
            write_time += raw_stats['write_time']

            objects_in_it = self.continue_fetching(raw_stats['quantity'],
                                                   limit_read)

            i += 1
            if i % 50 == 0 or True:
                if offset >= expected_documents:
                    # To have better logs, we check the remaining entries
                    self.coll_stats = self.mongo_primary.collection_stats(
                        db=self.db, coll=self.coll)
                    expected_documents = int(
                        max(1, self.coll_stats['count'] / self.total_seeds))

                ratio = int(
                    1000 * offset /
                    expected_documents) / 10  # To have the format 100.0%
                dt = time.time() - st
                average_speed = 1
                expected_remaining_time = 0
                if dt > 0 and offset / dt > 0:
                    average_speed = offset / dt
                    expected_remaining_time = int(
                        (expected_documents - offset) /
                        (average_speed * 60))  # In minutes

                time_log = 'Read time: ' + str(int(
                    100 * read_time / dt)) + '%, write time: ' + str(
                        int(100 * write_time / dt)) + '%'
                print(
                    str(self) + ' (syncing): ' + str(offset) + '/' +
                    str(expected_documents) + ' docs (' + str(ratio) + '%, ' +
                    str(int(average_speed)) + ' docs/s). Remaining time: ~' +
                    str(expected_remaining_time) + ' minutes. ' + time_log)

        dt = time.time() - st
        print(
            str(self) + ' (end-sync): ' + str(offset) + ' docs, ' +
            str(int(storage_size_part)) + 'GB. Time spent: ' + str(int(dt)) +
            's.')

        # We return some stats
        return {
            'quantity': offset,
            'read_time': read_time,
            'write_time': write_time
        }

    """
        In charge of syncing its part of the collection (between the two given seeds). Return the number of synced objects.
        We are not using any iterator in this case, so the method should normally not crash if the connexion is lost
        at the wrong time.
    """

    def sync_section(self, offset, limit_read, limit_write):
        raise ValueError('Not implemented, to override')

    def __str__(self):
        return 'CollectionPart:' + self.db + '.' + self.coll + ':[' + str(
            self.seed_start['_id']) + ';' + str(self.seed_end['_id']) + ']'

    def __repr__(self):
        return self.__str__()
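
The subclasses referenced here (BasicCollectionPart, OplogCollectionPart) are not part of these examples, so the contract expected by sync() is easiest to see on a toy stand-in. The following self-contained sketch copies an in-memory list instead of a MongoDB collection, but returns the same {'quantity', 'read_time', 'write_time'} shape from sync_section() and uses continue_fetching() to stop after a short batch; it only illustrates the semantics, not the project's real implementation.

import time

# Illustrative stand-in for a CollectionPart subclass: the "collection" is an
# in-memory list, but the batching contract matches sync() above.
class FakeCollectionPart:
    def __init__(self, source_docs):
        self.source = source_docs
        self.copied = []

    def continue_fetching(self, received_quantity, expected_quantity):
        # A full batch means there is probably more to read; a short batch means we are done.
        return received_quantity >= expected_quantity

    def sync_section(self, offset, limit_read, limit_write):
        read_start = time.time()
        batch = self.source[offset:offset + limit_read]        # "read" step
        read_time = time.time() - read_start

        write_start = time.time()
        for i in range(0, len(batch), limit_write):             # "write" step, in sub-batches under the size limit
            self.copied.extend(batch[i:i + limit_write])
        write_time = time.time() - write_start

        return {'quantity': len(batch), 'read_time': read_time, 'write_time': write_time}

if __name__ == '__main__':
    part = FakeCollectionPart([{'_id': n} for n in range(2500)])
    offset, more = 0, True
    while more:
        stats = part.sync_section(offset, limit_read=1000, limit_write=100)
        offset += stats['quantity']
        more = part.continue_fetching(stats['quantity'], 1000)
    print('Copied ' + str(len(part.copied)) + ' documents')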
Example #9
    def __init__(self, configuration):
        self.configuration = configuration
        self.primary = Mongo(configuration, is_primary=True)
        self.secondary = Mongo(configuration, is_primary=False)
Example #10
class Core:
    def __init__(self, configuration):
        self.configuration = configuration
        self.primary = Mongo(configuration, is_primary=True)
        self.secondary = Mongo(configuration, is_primary=False)

    """
        In charge of launching the entire synchronisation of every database. Simple version without any multi-threading.
    """

    def start(self):
        print('Prepare sync of the following databases: ' +
              str(', '.join(self.primary.list_databases())))

        # Check all CollectionParts we need to create
        oplog_input = None
        other_inputs = []
        for db in self.primary.list_databases():
            for coll in self.primary.list_collections(db):
                collection = Collection(configuration=self.configuration,
                                        db=db,
                                        coll=coll)
                collection_part_inputs = collection.prepare_sync()

                for inputs in collection_part_inputs:
                    # We need to reserve a long-running thread for the oplog, so we put it as the first element of the Queue
                    data = {'collection_part': inputs}
                    if db == "local" and coll == "oplog.rs":
                        oplog_input = data
                    else:
                        other_inputs.append(data)

        if oplog_input is None:
            raise ValueError("No oplog found...")

        # Fill queues used for the multi-threading
        qi = mp.Queue()
        qo = mp.Queue()

        qi.put(oplog_input)
        for inputs in other_inputs:
            qi.put(inputs)

        # Start the jobs. We need at least 1 thread for the oplog, and another one for the other collections
        jobs = []
        jobs_quantity = 1 + int(max(1, self.configuration.internal_threads()))
        common_info = {'configuration_filepath': Configuration.FILEPATH}
        for i in range(int(jobs_quantity)):
            qi.put('DONE')
            job = mp.Process(target=clone_collection_part,
                             args=(
                                 qi,
                                 qo,
                                 i,
                                 common_info,
                             ))
            job.start()
            jobs.append(job)

        job_done = 0
        while job_done < (
                jobs_quantity - 1
        ):  # There is one long-running thread which should never finish by itself.
            try:
                res = qo.get(timeout=3600 * 24)
                if res == 'DONE':
                    job_done += 1
                    print('Remaining jobs: ' +
                          str(jobs_quantity - job_done - 1))
            except QueueEmpty:  # We cannot put a super-huge timeout, so we simply handle the exception
                pass
            except:
                raise  # Raise all other errors

        print(
            'End of the synchronisation of every database; the oplog synchronisation will continue until you stop this script. Afterwards, just remove the database from maintenance mode.'
        )

    """
        Create the appropriate CollectionPart instance
    """

    @staticmethod
    def create_collection_part(inputs):
        if inputs['db'] == 'local' and inputs['coll'] == 'oplog.rs':
            return OplogCollectionPart(**inputs)
        else:
            return BasicCollectionPart(**inputs)
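
clone_collection_part itself is not shown in these examples. Based on how Core.start() uses the two queues ('DONE' sentinels pushed on the input queue, one 'DONE' report per finished worker on the output queue), a worker of the following shape would fit that protocol; this is a sketch under those assumptions, not the project's actual implementation, and the Configuration(...) constructor call is hypothetical.

# Hedged sketch of a worker matching the queue protocol used by Core.start():
# consume collection-part inputs until the 'DONE' sentinel, sync each one,
# then report 'DONE' on the output queue so the parent can count finished jobs.
def clone_collection_part_sketch(qi, qo, worker_id, common_info):
    # Assumption: each process rebuilds its own Configuration (and therefore its own
    # Mongo connections) from the filepath passed in common_info.
    configuration = Configuration(common_info['configuration_filepath'])  # hypothetical constructor
    while True:
        data = qi.get()
        if data == 'DONE':
            break
        inputs = dict(data['collection_part'])
        inputs['configuration'] = configuration  # assumption: the CollectionPart also needs the configuration
        collection_part = Core.create_collection_part(inputs)
        collection_part.sync()
    qo.put('DONE')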