예제 #1
0
    def init_cursor(self):
        """Position the cursor appropriately.

        The cursor is set to either the beginning of the oplog, or
        wherever it was last left off.

        Returns the cursor and True if the cursor is empty.
        """
        timestamp = self.read_last_checkpoint()

        if timestamp is None:
            if self.collection_dump:
                # dump collection and update checkpoint
                timestamp = self.dump_collection()
                self.update_checkpoint(timestamp)
                if timestamp is None:
                    return None, True
            else:
                # Collection dump disabled:
                # Return cursor to beginning of oplog but do not set the
                # checkpoint. The checkpoint will be set after an operation
                # has been applied.
                cursor = self.get_oplog_cursor()
                return cursor, self._cursor_empty(cursor)

        cursor = self.get_oplog_cursor(timestamp)
        cursor_empty = self._cursor_empty(cursor)

        if cursor_empty:
            # rollback, update checkpoint, and retry
            LOG.debug("OplogThread: Initiating rollback from "
                      "get_oplog_cursor")
            self.update_checkpoint(self.rollback())
            return self.init_cursor()

        first_oplog_entry = next(cursor)

        checkpoint_ts_long = util.bson_ts_to_long(timestamp)
        if not self.continue_on_error:
            oldest_ts_long = util.bson_ts_to_long(
                self.get_oldest_oplog_timestamp())
            if checkpoint_ts_long < oldest_ts_long:
                # We've fallen behind, the checkpoint has fallen off the oplog
                return None, True

        cursor_ts_long = util.bson_ts_to_long(first_oplog_entry["ts"])
        if cursor_ts_long > checkpoint_ts_long:
            # The checkpoint is not present in this oplog and the oplog
            # did not rollover. This means that we connected to a new
            # primary which did not replicate the checkpoint and which has
            # new changes in its oplog for us to process.
            # rollback, update checkpoint, and retry
            LOG.debug("OplogThread: Initiating rollback from "
                      "get_oplog_cursor: new oplog entries found but "
                      "checkpoint is not present")
            self.update_checkpoint(self.rollback())
            return self.init_cursor()

        # first entry has been consumed
        return cursor, cursor_empty
 def test_upgrade_oplog_progress(self):
     first_oplog_ts1 = self.opman1.oplog.find_one()['ts']
     first_oplog_ts2 = self.opman2.oplog.find_one()['ts']
     # Old format oplog progress file:
     progress = {
         str(self.opman1.oplog): bson_ts_to_long(first_oplog_ts1),
         str(self.opman2.oplog): bson_ts_to_long(first_oplog_ts2)
     }
     # Set up oplog managers to use the old format.
     oplog_progress = LockingDict()
     oplog_progress.dict = progress
     self.opman1.oplog_progress = oplog_progress
     self.opman2.oplog_progress = oplog_progress
     # Cause the oplog managers to update their checkpoints.
     self.opman1.update_checkpoint(first_oplog_ts1)
     self.opman2.update_checkpoint(first_oplog_ts2)
     # New format should be in place now.
     new_format = {
         self.opman1.replset_name: first_oplog_ts1,
         self.opman2.replset_name: first_oplog_ts2
     }
     self.assertEqual(
         new_format,
         self.opman1.oplog_progress.get_dict()
     )
     self.assertEqual(
         new_format,
         self.opman2.oplog_progress.get_dict()
     )
 def test_upgrade_oplog_progress(self):
     first_oplog_ts1 = self.opman1.oplog.find_one()['ts']
     first_oplog_ts2 = self.opman2.oplog.find_one()['ts']
     # Old format oplog progress file:
     progress = {
         str(self.opman1.oplog): bson_ts_to_long(first_oplog_ts1),
         str(self.opman2.oplog): bson_ts_to_long(first_oplog_ts2)
     }
     # Set up oplog managers to use the old format.
     oplog_progress = LockingDict()
     oplog_progress.dict = progress
     self.opman1.oplog_progress = oplog_progress
     self.opman2.oplog_progress = oplog_progress
     # Cause the oplog managers to update their checkpoints.
     self.opman1.update_checkpoint(first_oplog_ts1)
     self.opman2.update_checkpoint(first_oplog_ts2)
     # New format should be in place now.
     new_format = {
         self.opman1.replset_name: first_oplog_ts1,
         self.opman2.replset_name: first_oplog_ts2
     }
     self.assertEqual(
         new_format,
         self.opman1.oplog_progress.get_dict()
     )
     self.assertEqual(
         new_format,
         self.opman2.oplog_progress.get_dict()
     )
예제 #4
0
    def init_cursor(self):
        """Position the cursor appropriately.

        The cursor is set to either the beginning of the oplog, or
        wherever it was last left off.

        Returns the cursor and True if the cursor is empty.
        """
        timestamp = self.read_last_checkpoint()

        if timestamp is None:
            if self.collection_dump:
                # dump collection and update checkpoint
                timestamp = self.dump_collection()
                self.update_checkpoint(timestamp)
                if timestamp is None:
                    return None, True
            else:
                # Collection dump disabled:
                # Return cursor to beginning of oplog but do not set the
                # checkpoint. The checkpoint will be set after an operation
                # has been applied.
                cursor = self.get_oplog_cursor()
                return cursor, self._cursor_empty(cursor)

        cursor = self.get_oplog_cursor(timestamp)
        cursor_empty = self._cursor_empty(cursor)

        if cursor_empty:
            # rollback, update checkpoint, and retry
            LOG.debug("OplogThread: Initiating rollback from "
                      "get_oplog_cursor")
            self.update_checkpoint(self.rollback())
            return self.init_cursor()

        first_oplog_entry = next(cursor)

        oldest_ts_long = util.bson_ts_to_long(
            self.get_oldest_oplog_timestamp())
        checkpoint_ts_long = util.bson_ts_to_long(timestamp)
        if checkpoint_ts_long < oldest_ts_long:
            # We've fallen behind, the checkpoint has fallen off the oplog
            return None, True

        cursor_ts_long = util.bson_ts_to_long(first_oplog_entry["ts"])
        if cursor_ts_long > checkpoint_ts_long:
            # The checkpoint is not present in this oplog and the oplog
            # did not rollover. This means that we connected to a new
            # primary which did not replicate the checkpoint and which has
            # new changes in its oplog for us to process.
            # rollback, update checkpoint, and retry
            LOG.debug("OplogThread: Initiating rollback from "
                      "get_oplog_cursor: new oplog entries found but "
                      "checkpoint is not present")
            self.update_checkpoint(self.rollback())
            return self.init_cursor()

        # first entry has been consumed
        return cursor, cursor_empty
예제 #5
0
    def init_cursor(self):
        """Position the cursor appropriately.

        The cursor is set to either the beginning of the oplog, or
        wherever it was last left off.

        Returns the cursor and the number of documents left in the cursor.
        """

        timestamp = self.checkpoint
        if self.checkpoint is None:
            if self.collection_dump:
                # dump collection and update checkpoint
                timestamp = self.dump_collection()
                if timestamp is None:
                    return None, 0
            else:
                # Collection dump disabled:
                # return cursor to beginning of oplog.
                cursor = self.get_oplog_cursor()
                self.checkpoint = self.get_last_oplog_timestamp()
                return cursor, retry_until_ok(cursor.count)

        for i in range(60):
            cursor = self.get_oplog_cursor(timestamp)
            cursor_len = retry_until_ok(cursor.count)

            if cursor_len == 0:
                # rollback, update checkpoint, and retry
                logging.debug("OplogThread: Initiating rollback from "
                              "get_oplog_cursor")
                self.checkpoint = self.rollback()
                return self.init_cursor()

            # try to get the first oplog entry
            try:
                first_oplog_entry = next(cursor)
            except StopIteration:
                # It's possible for the cursor to become invalid
                # between the cursor.count() call and now
                time.sleep(1)
                continue

            # first entry should be last oplog entry processed
            cursor_ts_long = util.bson_ts_to_long(
                first_oplog_entry.get("ts"))
            given_ts_long = util.bson_ts_to_long(timestamp)
            if cursor_ts_long > given_ts_long:
                # first entry in oplog is beyond timestamp
                # we've fallen behind
                return None, 0

            # first entry has been consumed
            return cursor, cursor_len - 1

        else:
            raise errors.MongoConnectorError(
                "Could not initialize oplog cursor.")
예제 #6
0
    def write_oplog_progress(self):
        """ Writes oplog progress to file provided by user
        """

        if self.oplog_checkpoint is None:
            return None

        with self.oplog_progress as oplog_prog:
            oplog_dict = oplog_prog.get_dict()
        items = [[name, util.bson_ts_to_long(oplog_dict[name])]
                 for name in oplog_dict]
        if not items:
            return

        # write to temp file
        backup_file = self.oplog_checkpoint + '.backup'
        os.rename(self.oplog_checkpoint, backup_file)

        # for each of the threads write to file
        with open(self.oplog_checkpoint, 'w') as dest:
            if len(items) == 1:
                # Write 1-dimensional array, as in previous versions.
                json_str = json.dumps(items[0])
            else:
                # Write a 2d array to support sharded clusters.
                json_str = json.dumps(items)
            try:
                dest.write(json_str)
            except IOError:
                # Basically wipe the file, copy from backup
                dest.truncate()
                with open(backup_file, 'r') as backup:
                    shutil.copyfile(backup, dest)

        os.remove(backup_file)
예제 #7
0
    def write_oplog_progress(self):
        """ Writes oplog progress to file provided by user
        """

        if self.oplog_checkpoint is None:
            return None

        with self.oplog_progress as oplog_prog:
            oplog_dict = oplog_prog.get_dict()
        items = [[name, util.bson_ts_to_long(oplog_dict[name])]
                 for name in oplog_dict]
        if not items:
            return

        # write to temp file
        backup_file = self.oplog_checkpoint + '.backup'
        os.rename(self.oplog_checkpoint, backup_file)

        # for each of the threads write to file
        with open(self.oplog_checkpoint, 'w') as dest:
            if len(items) == 1:
                # Write 1-dimensional array, as in previous versions.
                json_str = json.dumps(items[0])
            else:
                # Write a 2d array to support sharded clusters.
                json_str = json.dumps(items)
            try:
                dest.write(json_str)
            except IOError:
                # Basically wipe the file, copy from backup
                dest.truncate()
                with open(backup_file, 'r') as backup:
                    shutil.copyfile(backup, dest)

        os.remove(backup_file)
예제 #8
0
    def write_oplog_progress(self):
        """ Writes oplog progress to file provided by user
        """

        if self.oplog_checkpoint is None:
            return None

        # write to temp file
        backup_file = self.oplog_checkpoint + '.backup'
        os.rename(self.oplog_checkpoint, backup_file)

        # for each of the threads write to file
        with open(self.oplog_checkpoint, 'w') as dest:
            with self.oplog_progress as oplog_prog:

                oplog_dict = oplog_prog.get_dict()
                for oplog, time_stamp in oplog_dict.items():
                    oplog_str = str(oplog)
                    timestamp = util.bson_ts_to_long(time_stamp)
                    json_str = json.dumps([oplog_str, timestamp])
                    try:
                        dest.write(json_str)
                    except IOError:
                        # Basically wipe the file, copy from backup
                        dest.truncate()
                        with open(backup_file, 'r') as backup:
                            shutil.copyfile(backup, dest)
                        break

        os.remove(self.oplog_checkpoint + '.backup')
예제 #9
0
    def write_oplog_progress(self):
        """ Writes oplog progress to file provided by user
        """

        if self.oplog_checkpoint is None:
            return None

        # write to temp file
        backup_file = self.oplog_checkpoint + '.backup'
        os.rename(self.oplog_checkpoint, backup_file)

        # for each of the threads write to file
        with open(self.oplog_checkpoint, 'w') as dest:
            with self.oplog_progress as oplog_prog:

                oplog_dict = oplog_prog.get_dict()
                for oplog, time_stamp in oplog_dict.items():
                    oplog_str = str(oplog)
                    timestamp = util.bson_ts_to_long(time_stamp)
                    json_str = json.dumps([oplog_str, timestamp])
                    try:
                        dest.write(json_str)
                    except IOError:
                        # Basically wipe the file, copy from backup
                        dest.truncate()
                        with open(backup_file, 'r') as backup:
                            shutil.copyfile(backup, dest)
                        break

        os.remove(self.oplog_checkpoint + '.backup')
예제 #10
0
    def dump_collection(self):
        """Dumps collection into the target system.

        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        dump_set = self.namespace_set

        #no namespaces specified
        if not self.namespace_set:
            db_list = self.main_connection.database_names()
            for database in db_list:
                if database == "config" or database == "local":
                    continue
                coll_list = self.main_connection[database].collection_names()
                for coll in coll_list:
                    if coll.startswith("system"):
                        continue
                    namespace = str(database) + "." + str(coll)
                    dump_set.append(namespace)

        timestamp = util.retry_until_ok(self.get_last_oplog_timestamp)
        if timestamp is None:
            return None
        long_ts = util.bson_ts_to_long(timestamp)

        def docs_to_dump():
            for namespace in dump_set:
                logging.info("dumping collection %s" % namespace)
                database, coll = namespace.split('.', 1)
                target_coll = self.main_connection[database][coll]
                cursor = util.retry_until_ok(target_coll.find)
                for doc in cursor:
                    if not self.running:
                        raise StopIteration
                    doc["ns"] = namespace
                    doc["_ts"] = long_ts
                    yield doc

        try:
            # Bulk upsert if possible
            if self.can_bulk:
                self.doc_manager.bulk_upsert(docs_to_dump())
            else:
                for doc in docs_to_dump():
                    try:
                        self.doc_manager.upsert(self.filter_fields(doc))
                    except errors.OperationFailed:
                        logging.error("Unable to insert %s" % doc)
        except (pymongo.errors.AutoReconnect,
                pymongo.errors.OperationFailure):
            err_msg = "OplogManager: Failed during dump collection"
            effect = "cannot recover!"
            logging.error('%s %s %s' % (err_msg, effect, self.oplog))
            self.running = False
            return None

        return timestamp
예제 #11
0
    def init_cursor(self):
        """Position the cursor appropriately.

        The cursor is set to either the beginning of the oplog, or
        wherever it was last left off.

        Returns the cursor and the number of documents left in the cursor.
        """
        timestamp = self.read_last_checkpoint()

        if timestamp is None:
            if self.collection_dump:
                # dump collection and update checkpoint
                timestamp = self.dump_collection()
                if timestamp is None:
                    return None, 0
            else:
                # Collection dump disabled:
                # return cursor to beginning of oplog.
                self.checkpoint = self.get_last_oplog_timestamp()
                self.update_checkpoint()
                return self.get_oplog_cursor()

        self.checkpoint = timestamp
        self.update_checkpoint()

        cursor, cursor_len = self.get_oplog_cursor(timestamp)

        if cursor_len == 0:
            # rollback, update checkpoint, and retry
            logging.debug("OplogThread: Initiating rollback from "
                          "get_oplog_cursor")
            self.checkpoint = self.rollback()
            self.update_checkpoint()
            return self.init_cursor()

        # first entry should be last oplog entry processed
        first_oplog_entry = retry_until_ok(lambda: cursor[0])
        cursor_ts_long = util.bson_ts_to_long(first_oplog_entry.get("ts"))
        given_ts_long = util.bson_ts_to_long(timestamp)
        if cursor_ts_long > given_ts_long:
            # first entry in oplog is beyond timestamp, we've fallen behind!
            return None, 0

        retry_until_ok(next, cursor)
        return cursor, cursor_len - 1
예제 #12
0
    def test_bson_ts_to_long(self):
        """Test bson_ts_to_long and long_to_bson_ts
        """

        tstamp = timestamp.Timestamp(0x12345678, 0x90abcdef)

        self.assertEqual(0x1234567890abcdef, bson_ts_to_long(tstamp))
        self.assertEqual(long_to_bson_ts(0x1234567890abcdef), tstamp)
예제 #13
0
    def test_bson_ts_to_long(self):
        """Test bson_ts_to_long and long_to_bson_ts
        """

        tstamp = timestamp.Timestamp(0x12345678, 0x90ABCDEF)

        self.assertEqual(0x1234567890ABCDEF, bson_ts_to_long(tstamp))
        self.assertEqual(long_to_bson_ts(0x1234567890ABCDEF), tstamp)
예제 #14
0
    def get_oplog_cursor(self, timestamp):
        """Move cursor to the proper place in the oplog.
        """

        if timestamp is None:
            return None

        cursor, cursor_len = None, 0
        while (True):
            try:
                cursor = self.oplog.find({'ts': {
                    '$gte': timestamp
                }},
                                         tailable=True,
                                         await_data=True)
                # Applying 8 as the mask to the cursor enables OplogReplay
                cursor.add_option(8)
                cursor_len = cursor.count()
                break
            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure):
                pass
        if cursor_len == 0:
            #rollback, we are past the last element in the oplog
            timestamp = self.rollback()

            logging.info('Finished rollback')
            return self.get_oplog_cursor(timestamp)
        cursor_ts_long = util.bson_ts_to_long(cursor[0].get("ts"))
        given_ts_long = util.bson_ts_to_long(timestamp)
        if cursor_ts_long > given_ts_long:
            # first entry in oplog is beyond timestamp, we've fallen behind!
            return None
        elif cursor_len == 1:  # means we are the end of the oplog
            self.checkpoint = timestamp
            #to commit new TS after rollbacks

            return cursor
        elif cursor_len > 1:
            doc = next(cursor)
            if timestamp == doc['ts']:
                return cursor
            else:  # error condition
                logging.error('%s Bad timestamp in config file' % self.oplog)
                return None
예제 #15
0
    def get_oplog_cursor(self, timestamp):
        """Move cursor to the proper place in the oplog.
        """

        if timestamp is None:
            return None

        cursor, cursor_len = None, 0
        while (True):
            try:
                cursor = self.oplog.find({'ts': {'$gte': timestamp}},
                                         tailable=True, await_data=True)
                # Applying 8 as the mask to the cursor enables OplogReplay
                cursor.add_option(8)
                cursor_len = cursor.count()
                break
            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure):
                pass
        if cursor_len == 0:
            #rollback, we are past the last element in the oplog
            timestamp = self.rollback()

            logging.info('Finished rollback')
            return self.get_oplog_cursor(timestamp)
        cursor_ts_long = util.bson_ts_to_long(cursor[0].get("ts"))
        given_ts_long = util.bson_ts_to_long(timestamp)
        if cursor_ts_long > given_ts_long:
            # first entry in oplog is beyond timestamp, we've fallen behind!
            return None
        elif cursor_len == 1:     # means we are the end of the oplog
            self.checkpoint = timestamp
            #to commit new TS after rollbacks

            return cursor
        elif cursor_len > 1:
            doc = next(cursor)
            if timestamp == doc['ts']:
                return cursor
            else:               # error condition
                logging.error('%s Bad timestamp in config file' % self.oplog)
                return None
예제 #16
0
    def dump_collection(self):
        """Dumps collection into the target system.
        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        timestamp = retry_until_ok(self.get_oldest_oplog_timestamp)
        if timestamp is None:
            return None
        long_ts = util.bson_ts_to_long(timestamp)
        return timestamp
예제 #17
0
    def dump_collection(self):
        """Dumps collection into the target system.

        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        dump_set = self.namespace_set

        #no namespaces specified
        if not self.namespace_set:
            db_list = self.main_connection.database_names()
            for database in db_list:
                if database == "config" or database == "local":
                    continue
                coll_list = self.main_connection[database].collection_names()
                for coll in coll_list:
                    if coll.startswith("system"):
                        continue
                    namespace = str(database) + "." + str(coll)
                    dump_set.append(namespace)

        timestamp = util.retry_until_ok(self.get_last_oplog_timestamp)
        if timestamp is None:
            return None
        for namespace in dump_set:
            database, coll = namespace.split('.', 1)
            target_coll = self.main_connection[database][coll]
            cursor = util.retry_until_ok(target_coll.find)
            long_ts = util.bson_ts_to_long(timestamp)

            try:
                for doc in cursor:
                    # Could spend a long time in this loop
                    if not self.running:
                        # Return None so we don't save our progress
                        return None
                    doc['ns'] = namespace
                    doc['_ts'] = long_ts
                    try:
                        self.doc_manager.upsert(doc)
                    except errors.OperationFailed:
                        logging.error("Unable to insert %s" % (doc))
            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure):

                err_msg = "OplogManager: Failed during dump collection"
                effect = "cannot recover!"
                logging.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                return

        return timestamp
예제 #18
0
    def setUp(self):
        """Initializes all required variables and set up arango connection, create test
            database and collection before every test method
        """
        self.url = "localhost:8529"
        self.timestamp = bson_ts_to_long(Timestamp(int(time.time()), 1))
        self.arango_doc_manager_obj = DocManager(self.url)
        self.arango_connection = self.arango_doc_manager_obj.arango

        self.database_name = "test_db"
        self.collection_name = "test_collection"
        self.namespace = self.database_name + '.' + self.collection_name

        self.db = self.arango_connection.create_database(self.database_name)
        self.collection = self.db.create_collection(self.collection_name)
예제 #19
0
    def rollback(self):
        """Rollback target system to consistent state.

        The strategy is to find the latest timestamp in the target system and
        the largest timestamp in the oplog less than the latest target system
        timestamp. This defines the rollback window and we just roll these
        back until the oplog and target system are in consistent states.
        """
        self.doc_manager.commit()
        last_inserted_doc = self.doc_manager.get_last_doc()

        if last_inserted_doc is None:
            return None

        target_ts = util.long_to_bson_ts(last_inserted_doc['_ts'])
        last_oplog_entry = self.oplog.find_one({'ts': {'$lte': target_ts}},
                                               sort=[('$natural',
                                               pymongo.DESCENDING)])
        if last_oplog_entry is None:
            return None

        rollback_cutoff_ts = last_oplog_entry['ts']
        start_ts = util.bson_ts_to_long(rollback_cutoff_ts)
        end_ts = last_inserted_doc['_ts']

        rollback_set = {}   # this is a dictionary of ns:list of docs
        for doc in self.doc_manager.search(start_ts, end_ts):
            if doc['ns'] in rollback_set:
                rollback_set[doc['ns']].append(doc)
            else:
                rollback_set[doc['ns']] = [doc]

        for namespace, doc_list in rollback_set.items():
            database, coll = namespace.split('.', 1)
            obj_id = bson.objectid.ObjectId
            bson_obj_id_list = [obj_id(doc['_id']) for doc in doc_list]

            to_update = util.retry_until_ok(
                self.main_connection[database][coll].find,
                {'_id': {'$in': bson_obj_id_list}})
            #doc list are docs in  target system, to_update are docs in mongo
            doc_hash = {}  # hash by _id
            for doc in doc_list:
                doc_hash[bson.objectid.ObjectId(doc['_id'])] = doc

            to_index = []
            count = 0
            while True:
                try:
                    for doc in to_update:
                        if doc['_id'] in doc_hash:
                            del doc_hash[doc['_id']]
                            to_index.append(doc)
                    break
                except (pymongo.errors.OperationFailure,
                        pymongo.errors.AutoReconnect):
                    count += 1
                    if count > 60:
                        sys.exit(1)
                    time.sleep(1)

            #delete the inconsistent documents
            for doc in doc_hash.values():
                self.doc_manager.remove(doc)

            #insert the ones from mongo
            for doc in to_index:
                doc['_ts'] = util.bson_ts_to_long(rollback_cutoff_ts)
                doc['ns'] = namespace
                try:
                    self.doc_manager.upsert(doc)
                except errors.OperationFailed:
                    logging.error("Unable to insert %s" % (doc))

        return rollback_cutoff_ts
예제 #20
0
    def test_rollback(self):
        """Test rollback in oplog_manager. Assertion failure if it doesn't pass
            We force a rollback by inserting a doc, killing the primary,
            inserting another doc, killing the new primary, and then restarting
            both.
        """
        os.system('rm config.txt; touch config.txt')
        test_oplog, primary_conn, mongos, solr = self.get_new_oplog()

        if not start_cluster():
            self.fail('Cluster could not be started successfully!')

        solr = DocManager()
        test_oplog.doc_manager = solr
        solr._delete()          # equivalent to solr.delete(q='*: *')

        mongos['test']['test'].remove({})
        mongos['test']['test'].insert( 
             {'_id': ObjectId('4ff74db3f646462b38000001'),
             'name': 'paulie'},
             safe=True
             )
        while (mongos['test']['test'].find().count() != 1):
            time.sleep(1)
        cutoff_ts = test_oplog.get_last_oplog_timestamp()

        first_doc = {'name': 'paulie', '_ts': bson_ts_to_long(cutoff_ts),
                     'ns': 'test.test',
                     '_id':  ObjectId('4ff74db3f646462b38000001')}

        #try kill one, try restarting
        kill_mongo_proc(primary_conn.host, PORTS_ONE['PRIMARY'])

        new_primary_conn = Connection(HOSTNAME, int(PORTS_ONE['SECONDARY']))
        admin = new_primary_conn['admin']
        while admin.command("isMaster")['ismaster'] is False:
            time.sleep(1)
        time.sleep(5)
        count = 0
        while True:
            try:
                mongos['test']['test'].insert({
                    '_id': ObjectId('4ff74db3f646462b38000002'),
                    'name': 'paul'}, 
                    safe=True)
                break
            except OperationFailure:
                count += 1
                if count > 60:
                    self.fail('Call to insert doc failed too many times')
                time.sleep(1)
                continue
        while (mongos['test']['test'].find().count() != 2):
            time.sleep(1)
        kill_mongo_proc(primary_conn.host, PORTS_ONE['SECONDARY'])
        start_mongo_proc(PORTS_ONE['PRIMARY'], "demo-repl", "/replset1a",
                       "/replset1a.log", None)

        #wait for master to be established
        while primary_conn['admin'].command("isMaster")['ismaster'] is False:
            time.sleep(1)

        start_mongo_proc(PORTS_ONE['SECONDARY'], "demo-repl", "/replset1b",
                       "/replset1b.log", None)

        #wait for secondary to be established
        admin = new_primary_conn['admin']
        while admin.command("replSetGetStatus")['myState'] != 2:
            time.sleep(1)
        while retry_until_ok(mongos['test']['test'].find().count) != 1:
            time.sleep(1)

        self.assertEqual(str(new_primary_conn.port), PORTS_ONE['SECONDARY'])
        self.assertEqual(str(primary_conn.port), PORTS_ONE['PRIMARY'])

        last_ts = test_oplog.get_last_oplog_timestamp()
        second_doc = {'name': 'paul', '_ts': bson_ts_to_long(last_ts),
                      'ns': 'test.test', 
                      '_id': ObjectId('4ff74db3f646462b38000002')}

        test_oplog.doc_manager.upsert(first_doc)
        test_oplog.doc_manager.upsert(second_doc)

        test_oplog.rollback()
        test_oplog.doc_manager.commit()
        results = solr._search()

        assert(len(results) == 1)

        self.assertEqual(results[0]['name'], 'paulie')
        self.assertTrue(results[0]['_ts'] <= bson_ts_to_long(cutoff_ts))
예제 #21
0
    def init_cursor(self):
        """Position the cursor appropriately.

        The cursor is set to either the beginning of the oplog, or
        wherever it was last left off.

        Returns the cursor and True if the cursor is empty.
        """
        timestamp = self.read_last_checkpoint()

        if timestamp is None:
            if self.collection_dump:
                # dump collection and update checkpoint
                timestamp = self.dump_collection()
                if timestamp is None:
                    return None, True
            else:
                # Collection dump disabled:
                # return cursor to beginning of oplog.
                cursor = self.get_oplog_cursor()
                self.checkpoint = self.get_last_oplog_timestamp()
                self.update_checkpoint()
                return cursor, retry_until_ok(self._cursor_empty, cursor)

        self.checkpoint = timestamp
        self.update_checkpoint()

        for i in range(60):
            cursor = self.get_oplog_cursor(timestamp)
            cursor_empty = retry_until_ok(self._cursor_empty, cursor)

            if cursor_empty:
                # rollback, update checkpoint, and retry
                LOG.debug("OplogThread: Initiating rollback from "
                          "get_oplog_cursor")
                self.checkpoint = self.rollback()
                self.update_checkpoint()
                return self.init_cursor()

            # try to get the first oplog entry
            try:
                first_oplog_entry = retry_until_ok(next, cursor)
            except StopIteration:
                # It's possible for the cursor to become invalid
                # between the next(cursor) call and now
                time.sleep(1)
                continue

            oldest_ts_long = util.bson_ts_to_long(
                self.get_oldest_oplog_timestamp())
            checkpoint_ts_long = util.bson_ts_to_long(timestamp)
            if checkpoint_ts_long < oldest_ts_long:
                # We've fallen behind, the checkpoint has fallen off the oplog
                return None, True

            cursor_ts_long = util.bson_ts_to_long(first_oplog_entry["ts"])
            if cursor_ts_long > checkpoint_ts_long:
                # The checkpoint is not present in this oplog and the oplog
                # did not rollover. This means that we connected to a new
                # primary which did not replicate the checkpoint and which has
                # new changes in its oplog for us to process.
                # rollback, update checkpoint, and retry
                LOG.debug("OplogThread: Initiating rollback from "
                          "get_oplog_cursor: new oplog entries found but "
                          "checkpoint is not present")
                self.checkpoint = self.rollback()
                self.update_checkpoint()
                return self.init_cursor()

            # first entry has been consumed
            return cursor, cursor_empty

        else:
            raise errors.MongoConnectorError(
                "Could not initialize oplog cursor.")
예제 #22
0
    def dump_collection(self):
        """Dumps collection into the target system.

        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        timestamp = util.retry_until_ok(self.get_last_oplog_timestamp)
        if timestamp is None:
            return None
        long_ts = util.bson_ts_to_long(timestamp)

        dump_set = self.namespace_set or []
        LOG.debug("OplogThread: Dumping set of collections %s " % dump_set)

        # No namespaces specified
        if not self.namespace_set:
            db_list = retry_until_ok(self.primary_client.database_names)
            for database in db_list:
                if database == "config" or database == "local":
                    continue
                coll_list = retry_until_ok(
                    self.primary_client[database].collection_names)
                for coll in coll_list:
                    # ignore system collections
                    if coll.startswith("system."):
                        continue
                    # ignore gridfs collections
                    if coll.endswith(".files") or coll.endswith(".chunks"):
                        continue
                    namespace = "%s.%s" % (database, coll)
                    dump_set.append(namespace)

        def docs_to_dump(namespace):
            database, coll = namespace.split('.', 1)
            last_id = None
            attempts = 0

            # Loop to handle possible AutoReconnect
            while attempts < 60:
                target_coll = self.primary_client[database][coll]
                if not last_id:
                    cursor = util.retry_until_ok(
                        target_coll.find,
                        projection=self._projection,
                        sort=[("_id", pymongo.ASCENDING)]
                    )
                else:
                    cursor = util.retry_until_ok(
                        target_coll.find,
                        {"_id": {"$gt": last_id}},
                        projection=self._projection,
                        sort=[("_id", pymongo.ASCENDING)]
                    )
                try:
                    for doc in cursor:
                        if not self.running:
                            raise StopIteration
                        last_id = doc["_id"]
                        yield doc
                    break
                except (pymongo.errors.AutoReconnect,
                        pymongo.errors.OperationFailure):
                    attempts += 1
                    time.sleep(1)

        def upsert_each(dm):
            num_inserted = 0
            num_failed = 0
            for namespace in dump_set:
                for num, doc in enumerate(docs_to_dump(namespace)):
                    if num % 10000 == 0:
                        LOG.debug("Upserted %d docs." % num)
                    try:
                        mapped_ns = self.dest_mapping.get(namespace, namespace)
                        dm.upsert(doc, mapped_ns, long_ts)
                        num_inserted += 1
                    except Exception:
                        if self.continue_on_error:
                            LOG.exception(
                                "Could not upsert document: %r" % doc)
                            num_failed += 1
                        else:
                            raise
            LOG.debug("Upserted %d docs" % num_inserted)
            if num_failed > 0:
                LOG.error("Failed to upsert %d docs" % num_failed)

        def upsert_all(dm):
            try:
                for namespace in dump_set:
                    mapped_ns = self.dest_mapping.get(namespace, namespace)
                    dm.bulk_upsert(docs_to_dump(namespace), mapped_ns, long_ts)
            except Exception:
                if self.continue_on_error:
                    LOG.exception("OplogThread: caught exception"
                                  " during bulk upsert, re-upserting"
                                  " documents serially")
                    upsert_each(dm)
                else:
                    raise

        def do_dump(dm, error_queue):
            try:
                # Dump the documents, bulk upsert if possible
                if hasattr(dm, "bulk_upsert"):
                    LOG.debug("OplogThread: Using bulk upsert function for "
                              "collection dump")
                    upsert_all(dm)
                else:
                    LOG.debug(
                        "OplogThread: DocManager %s has no "
                        "bulk_upsert method.  Upserting documents "
                        "serially for collection dump." % str(dm))
                    upsert_each(dm)

                # Dump GridFS files
                for gridfs_ns in self.gridfs_set:
                    db, coll = gridfs_ns.split('.', 1)
                    mongo_coll = self.primary_client[db][coll]
                    dest_ns = self.dest_mapping.get(gridfs_ns, gridfs_ns)
                    for doc in docs_to_dump(gridfs_ns + '.files'):
                        gridfile = GridFSFile(mongo_coll, doc)
                        dm.insert_file(gridfile, dest_ns, long_ts)
            except:
                # Likely exceptions:
                # pymongo.errors.OperationFailure,
                # mongo_connector.errors.ConnectionFailed
                # mongo_connector.errors.OperationFailed
                error_queue.put(sys.exc_info())

        # Extra threads (if any) that assist with collection dumps
        dumping_threads = []
        # Did the dump succeed for all target systems?
        dump_success = True
        # Holds any exceptions we can't recover from
        errors = queue.Queue()

        if len(self.doc_managers) == 1:
            do_dump(self.doc_managers[0], errors)
        else:
            # Slight performance gain breaking dump into separate
            # threads if > 1 replication target
            for dm in self.doc_managers:
                t = threading.Thread(target=do_dump, args=(dm, errors))
                dumping_threads.append(t)
                t.start()
            # cleanup
            for t in dumping_threads:
                t.join()

        # Print caught exceptions
        try:
            while True:
                LOG.critical('Exception during collection dump',
                             exc_info=errors.get_nowait())
                dump_success = False
        except queue.Empty:
            pass

        if not dump_success:
            err_msg = "OplogThread: Failed during dump collection"
            effect = "cannot recover!"
            LOG.error('%s %s %s' % (err_msg, effect, self.oplog))
            self.running = False
            return None

        return timestamp
예제 #23
0
    def run(self):
        """Start the oplog worker.
        """
        while self.running is True:
            cursor = self.init_cursor()

            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogManager: Last entry no longer in oplog"
                effect = "cannot recover!"
                logging.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                continue

            #The only entry is the last one we processed
            if cursor is None or util.retry_until_ok(cursor.count) == 1:
                time.sleep(1)
                continue

            last_ts = None
            err = False
            try:
                while cursor.alive and self.running:
                    for n, entry in enumerate(cursor):
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        #sync the current oplog operation
                        operation = entry['op']
                        ns = entry['ns']

                        #check if ns is excluded or not.
                        #also ensure non-empty namespace set.
                        if (ns not in self.namespace_set
                                and self.namespace_set):
                            continue

                        # use namespace mapping if one exists
                        ns = self.dest_mapping.get(entry['ns'], ns)

                        #delete
                        try:
                            if operation == 'd':
                                entry['_id'] = entry['o']['_id']
                                for dm in self.doc_managers:
                                    dm.remove(entry)
                            #insert/update. They are equal because of lack
                            #of support for partial update
                            elif operation == 'i' or operation == 'u':
                                doc = self.retrieve_doc(entry)
                                if doc is not None:
                                    doc['_ts'] = util.bson_ts_to_long(
                                        entry['ts'])
                                    doc['ns'] = ns
                                    for dm in self.doc_managers:
                                        dm.upsert(self.filter_fields(doc))
                        except errors.OperationFailed:
                            logging.error(
                                "Unable to %s doc with id %s" %
                                ("delete" if operation == "d" else "upsert",
                                 str(entry['_id'])))
                        except errors.ConnectionFailed:
                            logging.error(
                                "Network error while trying to %s %s" %
                                ("delete" if operation == "d" else "upsert",
                                 str(entry['_id'])))

                        last_ts = entry['ts']

                        # update timestamp per batch size
                        # n % -1 (default for self.batch_size) == 0 for all n
                        if n % self.batch_size == 1 and last_ts is not None:
                            self.checkpoint = last_ts
                            self.update_checkpoint()

                    # update timestamp after running through oplog
                    if last_ts is not None:
                        self.checkpoint = last_ts
                        self.update_checkpoint()

            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure):
                err = True

            if err is True and self.auth_key is not None:
                self.primary_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                self.main_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                err = False

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                self.checkpoint = last_ts
                self.update_checkpoint()

            time.sleep(2)
예제 #24
0
    def run(self):
        """Start the oplog worker.
        """
        logging.debug("OplogThread: Run thread started")
        while self.running is True:
            logging.debug("OplogThread: Getting cursor")
            cursor = self.init_cursor()
            logging.debug("OplogThread: Got the cursor, go go go!")

            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogThread: Last entry no longer in oplog"
                effect = "cannot recover!"
                logging.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                continue

            #The only entry is the last one we processed
            if cursor is None or util.retry_until_ok(cursor.count) == 1:
                logging.debug("OplogThread: Last entry is the one we "
                              "already processed.  Up to date.  Sleeping.")
                time.sleep(1)
                continue

            last_ts = None
            err = False
            remove_inc = 0
            upsert_inc = 0
            try:
                logging.debug("OplogThread: about to process new oplog "
                              "entries")
                while cursor.alive and self.running:
                    logging.debug("OplogThread: Cursor is still"
                                  " alive and thread is still running.")
                    for n, entry in enumerate(cursor):

                        logging.debug("OplogThread: Iterating through cursor,"
                                      " document number in this cursor is %d"
                                      % n)
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        # Don't replicate entries resulting from chunk moves
                        if entry.get("fromMigrate"):
                            continue

                        #sync the current oplog operation
                        operation = entry['op']
                        ns = entry['ns']

                        # use namespace mapping if one exists
                        ns = self.dest_mapping.get(entry['ns'], ns)

                        #delete
                        try:
                            logging.debug("OplogThread: Operation for this "
                                          "entry is %s" % str(operation))
                            if operation == 'd':
                                entry['_id'] = entry['o']['_id']
                                for dm in self.doc_managers:
                                    remove_inc += 1
                                    dm.remove(entry)
                            #insert/update. They are equal because of lack
                            #of support for partial update
                            elif operation == 'i' or operation == 'u':
                                doc = self.retrieve_doc(entry)
                                if doc is not None:
                                    doc['_ts'] = util.bson_ts_to_long(
                                        entry['ts'])
                                    doc['ns'] = ns
                                    for dm in self.doc_managers:
                                        upsert_inc += 1
                                        dm.upsert(doc)
                        except errors.OperationFailed:
                            logging.error(
                                "Unable to %s doc with id %s" % (
                                    "delete" if operation == "d" else "upsert",
                                    str(entry['_id'])
                                ))
                        except errors.ConnectionFailed:
                            logging.error(
                                "Network error while trying to %s %s" % (
                                    "delete" if operation == "d" else "upsert",
                                    str(entry['_id'])
                                ))

                        if (remove_inc + upsert_inc) % 1000 == 0:
                            logging.debug("OplogThread: Removed %d "
                                          " documents and upserted %d "
                                          " documents so far"
                                          % (remove_inc, upsert_inc))

                        logging.debug("OplogThread: Doc is processed.")

                        last_ts = entry['ts']

                        # update timestamp per batch size
                        # n % -1 (default for self.batch_size) == 0 for all n
                        if n % self.batch_size == 1 and last_ts is not None:
                            self.checkpoint = last_ts
                            self.update_checkpoint()

                    # update timestamp after running through oplog
                    if last_ts is not None:
                        logging.debug("OplogThread: updating checkpoint after"
                                      "processing new oplog entries")
                        self.checkpoint = last_ts
                        self.update_checkpoint()

            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure,
                    pymongo.errors.ConfigurationError):
                err = True

            if err is True and self.auth_key is not None:
                self.primary_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                self.main_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                err = False

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                logging.debug("OplogThread: updating checkpoint after an "
                              "Exception, cursor closing, or join() on this"
                              "thread.")
                self.checkpoint = last_ts
                self.update_checkpoint()

            logging.debug("OplogThread: Sleeping.  This batch I removed %d "
                          " documents and I upserted %d documents."
                          % (remove_inc, upsert_inc))
            time.sleep(2)
예제 #25
0
    def rollback(self):
        """Rollback target system to consistent state.

        The strategy is to find the latest timestamp in the target system and
        the largest timestamp in the oplog less than the latest target system
        timestamp. This defines the rollback window and we just roll these
        back until the oplog and target system are in consistent states.
        """
        # Find the most recently inserted document in each target system
        LOG.debug(
            "OplogThread: Initiating rollback sequence to bring "
            "system into a consistent state."
        )
        last_docs = []
        for dm in self.doc_managers:
            dm.commit()
            last_docs.append(dm.get_last_doc())

        # Of these documents, which is the most recent?
        last_inserted_doc = max(
            last_docs, key=lambda x: x["_ts"] if x else float("-inf")
        )

        # Nothing has been replicated. No need to rollback target systems
        if last_inserted_doc is None:
            return None

        # Find the oplog entry that touched the most recent document.
        # We'll use this to figure where to pick up the oplog later.
        target_ts = util.long_to_bson_ts(last_inserted_doc["_ts"])
        last_oplog_entry = util.retry_until_ok(
            self.oplog.find_one,
            {"ts": {"$lte": target_ts}, "op": {"$ne": "n"}},
            sort=[("$natural", pymongo.DESCENDING)],
        )

        LOG.debug("OplogThread: last oplog entry is %s" % str(last_oplog_entry))

        # The oplog entry for the most recent document doesn't exist anymore.
        # If we've fallen behind in the oplog, this will be caught later
        if last_oplog_entry is None:
            return None

        # rollback_cutoff_ts happened *before* the rollback
        rollback_cutoff_ts = last_oplog_entry["ts"]
        start_ts = util.bson_ts_to_long(rollback_cutoff_ts)
        # timestamp of the most recent document on any target system
        end_ts = last_inserted_doc["_ts"]

        for dm in self.doc_managers:
            rollback_set = {}  # this is a dictionary of ns:list of docs

            # group potentially conflicted documents by namespace
            for doc in dm.search(start_ts, end_ts):
                if doc["ns"] in rollback_set:
                    rollback_set[doc["ns"]].append(doc)
                else:
                    rollback_set[doc["ns"]] = [doc]

            # retrieve these documents from MongoDB, either updating
            # or removing them in each target system
            for namespace, doc_list in rollback_set.items():
                # Get the original namespace
                original_namespace = self.namespace_config.unmap_namespace(namespace)
                if not original_namespace:
                    original_namespace = namespace

                database, coll = original_namespace.split(".", 1)
                obj_id = bson.objectid.ObjectId
                bson_obj_id_list = [obj_id(doc["_id"]) for doc in doc_list]

                # Use connection to whole cluster if in sharded environment.
                client = self.mongos_client or self.primary_client
                to_update = util.retry_until_ok(
                    client[database][coll].find,
                    {"_id": {"$in": bson_obj_id_list}},
                    projection=self.namespace_config.projection(original_namespace),
                )
                # Doc list are docs in target system, to_update are
                # Docs in mongo
                doc_hash = {}  # Hash by _id
                for doc in doc_list:
                    doc_hash[bson.objectid.ObjectId(doc["_id"])] = doc

                to_index = []

                def collect_existing_docs():
                    for doc in to_update:
                        if doc["_id"] in doc_hash:
                            del doc_hash[doc["_id"]]
                            to_index.append(doc)

                retry_until_ok(collect_existing_docs)

                # Delete the inconsistent documents
                LOG.debug("OplogThread: Rollback, removing inconsistent " "docs.")
                remov_inc = 0
                for document_id in doc_hash:
                    try:
                        dm.remove(
                            document_id,
                            namespace,
                            util.bson_ts_to_long(rollback_cutoff_ts),
                        )
                        remov_inc += 1
                        LOG.debug("OplogThread: Rollback, removed %r " % doc)
                    except errors.OperationFailed:
                        LOG.warning(
                            "Could not delete document during rollback: %r "
                            "This can happen if this document was already "
                            "removed by another rollback happening at the "
                            "same time." % doc
                        )

                LOG.debug("OplogThread: Rollback, removed %d docs." % remov_inc)

                # Insert the ones from mongo
                LOG.debug("OplogThread: Rollback, inserting documents " "from mongo.")
                insert_inc = 0
                fail_insert_inc = 0
                for doc in to_index:
                    try:
                        insert_inc += 1
                        dm.upsert(
                            doc, namespace, util.bson_ts_to_long(rollback_cutoff_ts)
                        )
                    except errors.OperationFailed:
                        fail_insert_inc += 1
                        LOG.exception(
                            "OplogThread: Rollback, Unable to " "insert %r" % doc
                        )

        LOG.debug(
            "OplogThread: Rollback, Successfully inserted %d "
            " documents and failed to insert %d"
            " documents.  Returning a rollback cutoff time of %s "
            % (insert_inc, fail_insert_inc, str(rollback_cutoff_ts))
        )

        return rollback_cutoff_ts
예제 #26
0
    def run(self):
        """Start the oplog worker.
        """
        ReplicationLagLogger(self, 30).start()
        LOG.debug("OplogThread: Run thread started")
        while self.running is True:
            LOG.debug("OplogThread: Getting cursor")
            cursor, cursor_empty = retry_until_ok(self.init_cursor)
            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogThread: Last entry no longer in oplog"
                effect = "cannot recover!"
                LOG.error("%s %s %s" % (err_msg, effect, self.oplog))
                self.running = False
                continue

            if cursor_empty:
                LOG.debug(
                    "OplogThread: Last entry is the one we "
                    "already processed.  Up to date.  Sleeping."
                )
                time.sleep(1)
                continue

            last_ts = None
            remove_inc = 0
            upsert_inc = 0
            update_inc = 0
            try:
                LOG.debug("OplogThread: about to process new oplog entries")
                while cursor.alive and self.running:
                    LOG.debug(
                        "OplogThread: Cursor is still"
                        " alive and thread is still running."
                    )
                    for n, entry in enumerate(cursor):
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        LOG.debug(
                            "OplogThread: Iterating through cursor,"
                            " document number in this cursor is %d" % n
                        )

                        skip, is_gridfs_file = self._should_skip_entry(entry)
                        if skip:
                            # update the last_ts on skipped entries to ensure
                            # our checkpoint does not fall off the oplog. This
                            # also prevents reprocessing skipped entries.
                            last_ts = entry["ts"]
                            continue

                        # Sync the current oplog operation
                        operation = entry["op"]
                        ns = entry["ns"]
                        timestamp = util.bson_ts_to_long(entry["ts"])
                        for docman in self.doc_managers:
                            try:
                                LOG.debug(
                                    "OplogThread: Operation for this "
                                    "entry is %s" % str(operation)
                                )

                                # Remove
                                if operation == "d":
                                    docman.remove(entry["o"]["_id"], ns, timestamp)
                                    remove_inc += 1

                                # Insert
                                elif operation == "i":  # Insert
                                    # Retrieve inserted document from
                                    # 'o' field in oplog record
                                    doc = entry.get("o")
                                    # Extract timestamp and namespace
                                    if is_gridfs_file:
                                        db, coll = ns.split(".", 1)
                                        gridfile = GridFSFile(
                                            self.primary_client[db][coll], doc
                                        )
                                        docman.insert_file(gridfile, ns, timestamp)
                                    else:
                                        docman.upsert(doc, ns, timestamp)
                                    upsert_inc += 1

                                # Update
                                elif operation == "u":
                                    docman.update(
                                        entry["o2"]["_id"], entry["o"], ns, timestamp
                                    )
                                    update_inc += 1

                                # Command
                                elif operation == "c":
                                    # use unmapped namespace
                                    doc = entry.get("o")
                                    docman.handle_command(doc, entry["ns"], timestamp)

                            except errors.OperationFailed:
                                LOG.exception(
                                    "Unable to process oplog document %r" % entry
                                )
                            except errors.ConnectionFailed:
                                LOG.exception(
                                    "Connection failed while processing oplog "
                                    "document %r" % entry
                                )

                        if (remove_inc + upsert_inc + update_inc) % 1000 == 0:
                            LOG.debug(
                                "OplogThread: Documents removed: %d, "
                                "inserted: %d, updated: %d so far"
                                % (remove_inc, upsert_inc, update_inc)
                            )

                        LOG.debug("OplogThread: Doc is processed.")

                        last_ts = entry["ts"]

                        # update timestamp per batch size
                        # n % -1 (default for self.batch_size) == 0 for all n
                        if n % self.batch_size == 1:
                            self.update_checkpoint(last_ts)
                            last_ts = None

                    # update timestamp after running through oplog
                    if last_ts is not None:
                        LOG.debug(
                            "OplogThread: updating checkpoint after "
                            "processing new oplog entries"
                        )
                        self.update_checkpoint(last_ts)

            except (
                pymongo.errors.AutoReconnect,
                pymongo.errors.OperationFailure,
                pymongo.errors.ConfigurationError,
            ):
                LOG.exception(
                    "Cursor closed due to an exception. " "Will attempt to reconnect."
                )

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                LOG.debug(
                    "OplogThread: updating checkpoint after an "
                    "Exception, cursor closing, or join() on this"
                    "thread."
                )
                self.update_checkpoint(last_ts)

            LOG.debug(
                "OplogThread: Sleeping. Documents removed: %d, "
                "upserted: %d, updated: %d" % (remove_inc, upsert_inc, update_inc)
            )
            time.sleep(2)
예제 #27
0
    def dump_collection(self):
        """Dumps collection into the target system.

        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        dump_set = self.namespace_set or []
        logging.debug("OplogThread: Dumping set of collections %s " % dump_set)

        #no namespaces specified
        if not self.namespace_set:
            db_list = retry_until_ok(self.main_connection.database_names)
            for database in db_list:
                if database == "config" or database == "local":
                    continue
                coll_list = retry_until_ok(
                    self.main_connection[database].collection_names)
                for coll in coll_list:
                    if coll.startswith("system"):
                        continue
                    namespace = "%s.%s" % (database, coll)
                    dump_set.append(namespace)

        timestamp = util.retry_until_ok(self.get_last_oplog_timestamp)
        if timestamp is None:
            return None
        long_ts = util.bson_ts_to_long(timestamp)

        def docs_to_dump():
            for namespace in dump_set:
                logging.info("OplogThread: dumping collection %s"
                             % namespace)
                database, coll = namespace.split('.', 1)
                last_id = None
                attempts = 0

                # Loop to handle possible AutoReconnect
                while attempts < 60:
                    target_coll = self.main_connection[database][coll]
                    if not last_id:
                        cursor = util.retry_until_ok(
                            target_coll.find,
                            fields=self._fields,
                            sort=[("_id", pymongo.ASCENDING)]
                        )
                    else:
                        cursor = util.retry_until_ok(
                            target_coll.find,
                            {"_id": {"$gt": last_id}},
                            fields=self._fields,
                            sort=[("_id", pymongo.ASCENDING)]
                        )
                    try:
                        for doc in cursor:
                            if not self.running:
                                raise StopIteration
                            doc["ns"] = self.dest_mapping.get(
                                namespace, namespace)
                            doc["_ts"] = long_ts
                            last_id = doc["_id"]
                            yield doc
                        break
                    except pymongo.errors.AutoReconnect:
                        attempts += 1
                        time.sleep(1)

        # Extra threads (if any) that assist with collection dumps
        dumping_threads = []
        # Did the dump succeed for all target systems?
        dump_success = True
        # Holds any exceptions we can't recover from
        errors = queue.Queue()
        try:
            for dm in self.doc_managers:
                # Bulk upsert if possible
                if hasattr(dm, "bulk_upsert"):
                    logging.debug("OplogThread: Using bulk upsert function for"
                                  "collection dump")
                    # Slight performance gain breaking dump into separate
                    # threads, only if > 1 replication target
                    if len(self.doc_managers) == 1:
                        dm.bulk_upsert(docs_to_dump())
                    else:
                        def do_dump(error_queue):
                            all_docs = docs_to_dump()
                            try:
                                dm.bulk_upsert(all_docs)
                            except Exception:
                                # Likely exceptions:
                                # pymongo.errors.OperationFailure,
                                # mongo_connector.errors.ConnectionFailed
                                # mongo_connector.errors.OperationFailed
                                error_queue.put(sys.exc_info())

                        t = threading.Thread(target=do_dump, args=(errors,))
                        dumping_threads.append(t)
                        t.start()
                else:
                    logging.debug("OplogThread: DocManager %s has not"
                                  "bulk_upsert method.  Upserting documents "
                                  "serially for collection dump." % str(dm))
                    num = 0
                    for num, doc in enumerate(docs_to_dump()):
                        if num % 10000 == 0:
                            logging.debug("Upserted %d docs." % num)
                        dm.upsert(doc)
                    logging.debug("Upserted %d docs" % num)

            # cleanup
            for t in dumping_threads:
                t.join()

        except Exception:
            # See "likely exceptions" comment above
            errors.put(sys.exc_info())

        # Print caught exceptions
        try:
            while True:
                klass, value, trace = errors.get_nowait()
                dump_success = False
                traceback.print_exception(klass, value, trace)
        except queue.Empty:
            pass

        if not dump_success:
            err_msg = "OplogThread: Failed during dump collection"
            effect = "cannot recover!"
            logging.error('%s %s %s' % (err_msg, effect, self.oplog))
            self.running = False
            return None

        return timestamp
예제 #28
0
    def run(self):
        """Start the oplog worker.
        """
        logging.debug("OplogThread: Run thread started")
        while self.running is True:
            logging.debug("OplogThread: Getting cursor")
            cursor = self.init_cursor()
            logging.debug("OplogThread: Got the cursor, go go go!")

            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogThread: Last entry no longer in oplog"
                effect = "cannot recover!"
                logging.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                continue

            #The only entry is the last one we processed
            if cursor is None or util.retry_until_ok(cursor.count) == 1:
                logging.debug("OplogThread: Last entry is the one we "
                              "already processed.  Up to date.  Sleeping.")
                time.sleep(1)
                continue

            last_ts = None
            err = False
            remove_inc = 0
            upsert_inc = 0
            update_inc = 0
            try:
                logging.debug("OplogThread: about to process new oplog "
                              "entries")
                while cursor.alive and self.running:
                    logging.debug("OplogThread: Cursor is still"
                                  " alive and thread is still running.")
                    for n, entry in enumerate(cursor):

                        logging.debug("OplogThread: Iterating through cursor,"
                                      " document number in this cursor is %d"
                                      % n)
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        # Don't replicate entries resulting from chunk moves
                        if entry.get("fromMigrate"):
                            continue

                        # Take fields out of the oplog entry that
                        # shouldn't be replicated. This may nullify
                        # the document if there's nothing to do.
                        if not self.filter_oplog_entry(entry):
                            continue

                        #sync the current oplog operation
                        operation = entry['op']
                        ns = entry['ns']

                        # use namespace mapping if one exists
                        ns = self.dest_mapping.get(entry['ns'], ns)

                        for docman in self.doc_managers:
                            try:
                                logging.debug("OplogThread: Operation for this "
                                              "entry is %s" % str(operation))

                                # Remove
                                if operation == 'd':
                                    entry['_id'] = entry['o']['_id']
                                    entry['ns'] = ns
                                    docman.remove(entry)
                                    remove_inc += 1
                                # Insert
                                elif operation == 'i':  # Insert
                                    # Retrieve inserted document from
                                    # 'o' field in oplog record
                                    doc = entry.get('o')
                                    # Extract timestamp and namespace
                                    doc['_ts'] = util.bson_ts_to_long(
                                        entry['ts'])
                                    doc['ns'] = ns
                                    docman.upsert(doc)
                                    upsert_inc += 1
                                # Update
                                elif operation == 'u':
                                    doc = {"_id": entry['o2']['_id'],
                                           "_ts": util.bson_ts_to_long(
                                               entry['ts']),
                                           "ns": ns}
                                    # 'o' field contains the update spec
                                    docman.update(doc, entry.get('o', {}))
                                    update_inc += 1
                            except errors.OperationFailed:
                                logging.exception(
                                    "Unable to process oplog document %r"
                                    % entry)
                            except errors.ConnectionFailed:
                                logging.exception(
                                    "Connection failed while processing oplog "
                                    "document %r" % entry)

                        if (remove_inc + upsert_inc + update_inc) % 1000 == 0:
                            logging.debug(
                                "OplogThread: Documents removed: %d, "
                                "inserted: %d, updated: %d so far" % (
                                    remove_inc, upsert_inc, update_inc))

                        logging.debug("OplogThread: Doc is processed.")

                        last_ts = entry['ts']

                        # update timestamp per batch size
                        # n % -1 (default for self.batch_size) == 0 for all n
                        if n % self.batch_size == 1 and last_ts is not None:
                            self.checkpoint = last_ts
                            self.update_checkpoint()

                    # update timestamp after running through oplog
                    if last_ts is not None:
                        logging.debug("OplogThread: updating checkpoint after"
                                      "processing new oplog entries")
                        self.checkpoint = last_ts
                        self.update_checkpoint()

            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure,
                    pymongo.errors.ConfigurationError):
                logging.exception(
                    "Cursor closed due to an exception. "
                    "Will attempt to reconnect.")
                err = True

            if err is True and self.auth_key is not None:
                self.primary_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                self.main_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                err = False

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                logging.debug("OplogThread: updating checkpoint after an "
                              "Exception, cursor closing, or join() on this"
                              "thread.")
                self.checkpoint = last_ts
                self.update_checkpoint()

            logging.debug("OplogThread: Sleeping. Documents removed: %d, "
                          "upserted: %d, updated: %d"
                          % (remove_inc, upsert_inc, update_inc))
            time.sleep(2)
예제 #29
0
    def run(self):
        """Start the oplog worker.
        """
        ReplicationLagLogger(self, 30).start()
        LOG.debug("OplogThread: Run thread started")

        while self.running is True:
            LOG.debug("OplogThread: Getting cursor")
            cursor, cursor_empty = retry_until_ok(self.init_cursor)
            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogThread: Last entry no longer in oplog"
                effect = "cannot recover!"
                LOG.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                continue

            if cursor_empty:
                LOG.debug("OplogThread: Last entry is the one we "
                          "already processed.  Up to date.  Sleeping.")
                time.sleep(1)
                continue

            last_ts = None
            remove_inc = 0
            upsert_inc = 0
            update_inc = 0
            try:
                LOG.debug("OplogThread: about to process new oplog entries")
                while cursor.alive and self.running:
                    LOG.debug("OplogThread: Cursor is still"
                              " alive and thread is still running.")
                    for n, entry in enumerate(cursor):
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        LOG.debug("OplogThread: Iterating through cursor,"
                                  " document number in this cursor is %d" % n)

                        skip, is_gridfs_file = self._should_skip_entry(entry)
                        if skip:
                            # update the last_ts on skipped entries to ensure
                            # our checkpoint does not fall off the oplog. This
                            # also prevents reprocessing skipped entries.
                            last_ts = entry['ts']
                            continue

                        op_add = 0
                        op_remove = 0
                        op_update = 0

                        # Sync the current oplog operation
                        operation = entry['op']
                        ns = entry['ns']
                        timestamp = util.bson_ts_to_long(entry['ts'])
                        for docman in self.doc_managers:

                            @self.ERROR_TIME.time()
                            def process_exception(metric):
                                metric.inc()

                            try:
                                LOG.debug("OplogThread: Operation for this "
                                          "entry is %s" % str(operation))

                                # Remove
                                if operation == 'd':
                                    docman.remove(entry['o']['_id'], ns,
                                                  timestamp)
                                    remove_inc += 1
                                    op_remove += 1

                                # Insert
                                elif operation == 'i':  # Insert
                                    # Retrieve inserted document from
                                    # 'o' field in oplog record
                                    doc = entry.get('o')
                                    # Extract timestamp and namespace
                                    if is_gridfs_file:
                                        db, coll = ns.split('.', 1)
                                        gridfile = GridFSFile(
                                            self.primary_client[db][coll], doc)
                                        docman.insert_file(
                                            gridfile, ns, timestamp)
                                    else:
                                        docman.upsert(doc, ns, timestamp)
                                    upsert_inc += 1
                                    op_add += 1

                                # Update
                                elif operation == 'u':
                                    docman.update(entry['o2']['_id'],
                                                  entry['o'], ns, timestamp)
                                    update_inc += 1
                                    op_update += 1

                                # Command
                                elif operation == 'c':
                                    # use unmapped namespace
                                    doc = entry.get('o')
                                    docman.handle_command(
                                        doc, entry['ns'], timestamp)

                            except errors.OperationFailed:
                                # Remove
                                if operation == 'd':
                                    if op_remove > 0:
                                        op_remove -= 1
                                # Insert
                                elif operation == 'i':
                                    if op_add > 0:
                                        op_add -= 1
                                # Update
                                elif operation == 'u':
                                    if op_update > 0:
                                        op_update -= 1

                                process_exception(
                                    self.error_caught.labels(
                                        'cannot_process_doc',
                                        errors.OperationFailed))

                                LOG.exception(
                                    "Unable to process oplog document %r" %
                                    entry)
                            except errors.ConnectionFailed:
                                # Remove
                                if operation == 'd':
                                    if op_remove > 0:
                                        op_remove -= 1
                                # Insert
                                elif operation == 'i':
                                    if op_add > 0:
                                        op_add -= 1
                                # Update
                                elif operation == 'u':
                                    if op_update > 0:
                                        op_update -= 1

                                process_exception(
                                    self.error_caught.labels(
                                        'connection_failed',
                                        errors.ConnectionFailed))

                                LOG.exception(
                                    "Connection failed while processing oplog "
                                    "document %r" % entry)

                        if (remove_inc + upsert_inc + update_inc) % 1000 == 0:
                            LOG.debug("OplogThread: Documents removed: %d, "
                                      "inserted: %d, updated: %d so far" %
                                      (remove_inc, upsert_inc, update_inc))

                        LOG.debug("OplogThread: Doc is processed.")

                        last_ts = entry['ts']

                        # update timestamp per batch size
                        # n % -1 (default for self.batch_size) == 0 for all n
                        if n % self.batch_size == 1:
                            self.update_checkpoint(last_ts)
                            last_ts = None

                        LOG.always("Counter: Documents removed: %d, "
                                   "inserted: %d, updated: %d so far" %
                                   (op_remove, op_add, op_update))

                        # TODO: Add collection name as label
                        @self.REQUEST_TIME.time()
                        def process_request(add, remove, update):
                            self.doc_operation_count.labels('add').inc(add)
                            self.doc_operation_count.labels('remove').inc(
                                remove)
                            self.doc_operation_count.labels('update').inc(
                                update)

                        process_request(op_add, op_remove, op_update)

                    # update timestamp after running through oplog
                    if last_ts is not None:
                        LOG.debug("OplogThread: updating checkpoint after "
                                  "processing new oplog entries")
                        self.update_checkpoint(last_ts)

            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure,
                    pymongo.errors.ConfigurationError):
                LOG.exception("Cursor closed due to an exception. "
                              "Will attempt to reconnect.")

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                LOG.debug("OplogThread: updating checkpoint after an "
                          "Exception, cursor closing, or join() on this"
                          "thread.")
                self.update_checkpoint(last_ts)

            LOG.debug("OplogThread: Sleeping. Documents removed: %d, "
                      "upserted: %d, updated: %d" %
                      (remove_inc, upsert_inc, update_inc))
            time.sleep(2)
import pymongo
import bson
import time
import sys
from mongo_connector import util

mongo_url = 'mongodb://localhost:27017'
if len(sys.argv) == 1:
    print "First argument is mongodb connection string, i.e. localhost:27017. Assuming localhost:27017..."
if len(sys.argv) >= 2:
    mongo_url = sys.argv[1]

client = pymongo.MongoClient(mongo_url)
rs_name = client.admin.command('ismaster')['setName']
print('Found Replica Set name: {}'.format(str(rs_name)))

print('Now checking for the latest oplog entry...')
oplog = client.local.oplog.rs
last_oplog = oplog.find().sort('$natural', pymongo.DESCENDING).limit(-1).next()
print('Found the last oplog ts: {}'.format(str(last_oplog['ts'])))
last_ts = util.bson_ts_to_long(last_oplog['ts'])
out_str = '["{}", {}]'.format(str(rs_name), str(last_ts))

print(
    'Writing all to file oplog.timestamp.last in the format required for mongo-connector'
)
f = open('./oplog.timestamp.last', 'w')
f.write(out_str)
f.close()
print('All done!')
예제 #31
0
    def run(self):
        """Start the oplog worker.
        """
        ReplicationLagLogger(self, 30).start()
        LOG.debug("OplogThread: Run thread started")
        while self.running is True:
            LOG.debug("OplogThread: Getting cursor")
            cursor, cursor_empty = retry_until_ok(self.init_cursor)
            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogThread: Last entry no longer in oplog"
                effect = "cannot recover!"
                LOG.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                continue

            if cursor_empty:
                LOG.debug("OplogThread: Last entry is the one we "
                          "already processed.  Up to date.  Sleeping.")
                time.sleep(1)
                continue

            last_ts = None
            remove_inc = 0
            upsert_inc = 0
            update_inc = 0
            try:
                LOG.debug("OplogThread: about to process new oplog entries")
                while cursor.alive and self.running:
                    LOG.debug("OplogThread: Cursor is still"
                              " alive and thread is still running.")
                    for n, entry in enumerate(cursor):
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        LOG.debug("OplogThread: Iterating through cursor,"
                                  " document number in this cursor is %d"
                                  % n)

                        skip, is_gridfs_file = self._should_skip_entry(entry)
                        if skip:
                            # update the last_ts on skipped entries to ensure
                            # our checkpoint does not fall off the oplog. This
                            # also prevents reprocessing skipped entries.
                            last_ts = entry['ts']
                            continue

                        # Sync the current oplog operation
                        operation = entry['op']
                        ns = entry['ns']
                        timestamp = util.bson_ts_to_long(entry['ts'])
                        for docman in self.doc_managers:
                            try:
                                LOG.debug("OplogThread: Operation for this "
                                          "entry is %s" % str(operation))

                                # Remove
                                if operation == 'd':
                                    docman.remove(
                                        entry['o']['_id'], ns, timestamp)
                                    remove_inc += 1

                                # Insert
                                elif operation == 'i':  # Insert
                                    # Retrieve inserted document from
                                    # 'o' field in oplog record
                                    doc = entry.get('o')
                                    # Extract timestamp and namespace
                                    if is_gridfs_file:
                                        db, coll = ns.split('.', 1)
                                        gridfile = GridFSFile(
                                            self.primary_client[db][coll],
                                            doc)
                                        docman.insert_file(
                                            gridfile, ns, timestamp)
                                    else:
                                        docman.upsert(doc, ns, timestamp)
                                    upsert_inc += 1

                                # Update
                                elif operation == 'u':
                                    docman.update(entry['o2']['_id'],
                                                  entry['o'],
                                                  ns, timestamp)
                                    update_inc += 1

                                # Command
                                elif operation == 'c':
                                    # use unmapped namespace
                                    doc = entry.get('o')
                                    docman.handle_command(doc,
                                                          entry['ns'],
                                                          timestamp)

                            except errors.OperationFailed:
                                LOG.exception(
                                    "Unable to process oplog document %r"
                                    % entry)
                            except errors.ConnectionFailed:
                                LOG.exception(
                                    "Connection failed while processing oplog "
                                    "document %r" % entry)

                        if (remove_inc + upsert_inc + update_inc) % 1000 == 0:
                            LOG.debug(
                                "OplogThread: Documents removed: %d, "
                                "inserted: %d, updated: %d so far" % (
                                    remove_inc, upsert_inc, update_inc))

                        LOG.debug("OplogThread: Doc is processed.")

                        last_ts = entry['ts']

                        # update timestamp per batch size
                        # n % -1 (default for self.batch_size) == 0 for all n
                        if n % self.batch_size == 1:
                            self.update_checkpoint(last_ts)
                            last_ts = None

                    # update timestamp after running through oplog
                    if last_ts is not None:
                        LOG.debug("OplogThread: updating checkpoint after "
                                  "processing new oplog entries")
                        self.update_checkpoint(last_ts)

            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure,
                    pymongo.errors.ConfigurationError):
                LOG.exception(
                    "Cursor closed due to an exception. "
                    "Will attempt to reconnect.")

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                LOG.debug("OplogThread: updating checkpoint after an "
                          "Exception, cursor closing, or join() on this"
                          "thread.")
                self.update_checkpoint(last_ts)

            LOG.debug("OplogThread: Sleeping. Documents removed: %d, "
                      "upserted: %d, updated: %d"
                      % (remove_inc, upsert_inc, update_inc))
            time.sleep(2)
예제 #32
0
    def dump_collection(self):
        """Dumps collection into the target system.

        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        timestamp = retry_until_ok(self.get_last_oplog_timestamp)
        if timestamp is None:
            return None
        long_ts = util.bson_ts_to_long(timestamp)
        # Flag if this oplog thread was cancelled during the collection dump.
        # Use a list to workaround python scoping.
        dump_cancelled = [False]

        def get_all_ns():
            ns_set = []
            gridfs_ns_set = []
            db_list = retry_until_ok(self.primary_client.database_names)
            for database in db_list:
                if database == "config" or database == "local":
                    continue
                coll_list = retry_until_ok(
                    self.primary_client[database].collection_names)
                for coll in coll_list:
                    # ignore system collections
                    if coll.startswith("system."):
                        continue
                    # ignore gridfs chunks collections
                    if coll.endswith(".chunks"):
                        continue
                    if coll.endswith(".files"):
                        namespace = "%s.%s" % (database, coll)
                        namespace = namespace[:-len(".files")]
                        if self.namespace_config.gridfs_namespace(namespace):
                            gridfs_ns_set.append(namespace)
                    else:
                        namespace = "%s.%s" % (database, coll)
                        if self.namespace_config.map_namespace(namespace):
                            ns_set.append(namespace)
            return ns_set, gridfs_ns_set

        dump_set, gridfs_dump_set = get_all_ns()

        LOG.debug("OplogThread: Dumping set of collections %s " % dump_set)

        def docs_to_dump(from_coll):
            last_id = None
            attempts = 0
            projection = self.namespace_config.projection(from_coll.full_name)
            # Loop to handle possible AutoReconnect
            while attempts < 60:
                if last_id is None:
                    cursor = retry_until_ok(
                        from_coll.find,
                        projection=projection,
                        sort=[("_id", pymongo.ASCENDING)]
                    )
                else:
                    cursor = retry_until_ok(
                        from_coll.find,
                        {"_id": {"$gt": last_id}},
                        projection=projection,
                        sort=[("_id", pymongo.ASCENDING)]
                    )
                try:
                    for doc in cursor:
                        if not self.running:
                            # Thread was joined while performing the
                            # collection dump.
                            dump_cancelled[0] = True
                            raise StopIteration
                        last_id = doc["_id"]
                        yield doc
                    break
                except (pymongo.errors.AutoReconnect,
                        pymongo.errors.OperationFailure):
                    attempts += 1
                    time.sleep(1)

        def upsert_each(dm):
            num_failed = 0
            for namespace in dump_set:
                from_coll = self.get_collection(namespace)
                mapped_ns = self.namespace_config.map_namespace(namespace)
                total_docs = retry_until_ok(from_coll.count)
                num = None
                for num, doc in enumerate(docs_to_dump(from_coll)):
                    try:
                        dm.upsert(doc, mapped_ns, long_ts)
                    except Exception:
                        if self.continue_on_error:
                            LOG.exception(
                                "Could not upsert document: %r" % doc)
                            num_failed += 1
                        else:
                            raise
                    if num % 10000 == 0:
                        LOG.info("Upserted %d out of approximately %d docs "
                                 "from collection '%s'",
                                 num + 1, total_docs, namespace)
                if num is not None:
                    LOG.info("Upserted %d out of approximately %d docs from "
                             "collection '%s'",
                             num + 1, total_docs, namespace)
            if num_failed > 0:
                LOG.error("Failed to upsert %d docs" % num_failed)

        def upsert_all(dm):
            try:
                for namespace in dump_set:
                    from_coll = self.get_collection(namespace)
                    total_docs = retry_until_ok(from_coll.count)
                    mapped_ns = self.namespace_config.map_namespace(
                            namespace)
                    LOG.info("Bulk upserting approximately %d docs from "
                             "collection '%s'",
                             total_docs, namespace)
                    dm.bulk_upsert(docs_to_dump(from_coll),
                                   mapped_ns, long_ts)
            except Exception:
                if self.continue_on_error:
                    LOG.exception("OplogThread: caught exception"
                                  " during bulk upsert, re-upserting"
                                  " documents serially")
                    upsert_each(dm)
                else:
                    raise

        def do_dump(dm, error_queue):
            try:
                LOG.debug("OplogThread: Using bulk upsert function for "
                          "collection dump")
                upsert_all(dm)

                if gridfs_dump_set:
                    LOG.info("OplogThread: dumping GridFS collections: %s",
                             gridfs_dump_set)

                # Dump GridFS files
                for gridfs_ns in gridfs_dump_set:
                    mongo_coll = self.get_collection(gridfs_ns)
                    from_coll = self.get_collection(gridfs_ns + '.files')
                    dest_ns = self.namespace_config.map_namespace(gridfs_ns)
                    for doc in docs_to_dump(from_coll):
                        gridfile = GridFSFile(mongo_coll, doc)
                        dm.insert_file(gridfile, dest_ns, long_ts)
            except:
                # Likely exceptions:
                # pymongo.errors.OperationFailure,
                # mongo_connector.errors.ConnectionFailed
                # mongo_connector.errors.OperationFailed
                error_queue.put(sys.exc_info())

        # Extra threads (if any) that assist with collection dumps
        dumping_threads = []
        # Did the dump succeed for all target systems?
        dump_success = True
        # Holds any exceptions we can't recover from
        errors = queue.Queue()

        if len(self.doc_managers) == 1:
            do_dump(self.doc_managers[0], errors)
        else:
            # Slight performance gain breaking dump into separate
            # threads if > 1 replication target
            for dm in self.doc_managers:
                t = threading.Thread(target=do_dump, args=(dm, errors))
                dumping_threads.append(t)
                t.start()
            # cleanup
            for t in dumping_threads:
                t.join()

        # Print caught exceptions
        try:
            while True:
                LOG.critical('Exception during collection dump',
                             exc_info=errors.get_nowait())
                dump_success = False
        except queue.Empty:
            pass

        if not dump_success:
            err_msg = "OplogThread: Failed during dump collection"
            effect = "cannot recover!"
            LOG.error('%s %s %s' % (err_msg, effect, self.oplog))
            self.running = False
            return None

        if dump_cancelled[0]:
            LOG.warning('Initial collection dump was interrupted. '
                        'Will re-run the collection dump on next startup.')
            return None

        return timestamp
예제 #33
0
    def run(self):
        """Start the oplog worker.
        """
        logging.debug("OplogThread: Run thread started")
        while self.running is True:
            logging.debug("OplogThread: Getting cursor")
            cursor = self.init_cursor()
            logging.debug("OplogThread: Got the cursor, go go go!")

            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogThread: Last entry no longer in oplog"
                effect = "cannot recover!"
                logging.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                continue

            #The only entry is the last one we processed
            if cursor is None or util.retry_until_ok(cursor.count) == 1:
                logging.debug("OplogThread: Last entry is the one we "
                              "already processed.  Up to date.  Sleeping.")
                time.sleep(1)
                continue

            last_ts = None
            err = False
            remove_inc = 0
            upsert_inc = 0
            try:
                logging.debug("OplogThread: about to process new oplog "
                              "entries")
                while cursor.alive and self.running:
                    logging.debug("OplogThread: Cursor is still"
                                  " alive and thread is still running.")
                    for n, entry in enumerate(cursor):

                        logging.debug("OplogThread: Iterating through cursor,"
                                      " document number in this cursor is %d" %
                                      n)
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        # Don't replicate entries resulting from chunk moves
                        if entry.get("fromMigrate"):
                            continue

                        #sync the current oplog operation
                        operation = entry['op']
                        ns = entry['ns']

                        # use namespace mapping if one exists
                        ns = self.dest_mapping.get(entry['ns'], ns)

                        #delete
                        try:
                            logging.debug("OplogThread: Operation for this "
                                          "entry is %s" % str(operation))
                            if operation == 'd':
                                entry['_id'] = entry['o']['_id']
                                for dm in self.doc_managers:
                                    remove_inc += 1
                                    dm.remove(entry)
                            #insert/update. They are equal because of lack
                            #of support for partial update
                            elif operation == 'i' or operation == 'u':
                                doc = self.retrieve_doc(entry)
                                if doc is not None:
                                    doc['_ts'] = util.bson_ts_to_long(
                                        entry['ts'])
                                    doc['ns'] = ns
                                    for dm in self.doc_managers:
                                        upsert_inc += 1
                                        dm.upsert(doc)
                        except errors.OperationFailed:
                            logging.error(
                                "Unable to %s doc with id %s" %
                                ("delete" if operation == "d" else "upsert",
                                 str(entry['_id'])))
                        except errors.ConnectionFailed:
                            logging.error(
                                "Network error while trying to %s %s" %
                                ("delete" if operation == "d" else "upsert",
                                 str(entry['_id'])))

                        if (remove_inc + upsert_inc) % 1000 == 0:
                            logging.debug("OplogThread: Removed %d "
                                          " documents and upserted %d "
                                          " documents so far" %
                                          (remove_inc, upsert_inc))

                        logging.debug("OplogThread: Doc is processed.")

                        last_ts = entry['ts']

                        # update timestamp per batch size
                        # n % -1 (default for self.batch_size) == 0 for all n
                        if n % self.batch_size == 1 and last_ts is not None:
                            self.checkpoint = last_ts
                            self.update_checkpoint()

                    # update timestamp after running through oplog
                    if last_ts is not None:
                        logging.debug("OplogThread: updating checkpoint after"
                                      "processing new oplog entries")
                        self.checkpoint = last_ts
                        self.update_checkpoint()

            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure,
                    pymongo.errors.ConfigurationError):
                err = True

            if err is True and self.auth_key is not None:
                self.primary_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                self.main_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                err = False

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                logging.debug("OplogThread: updating checkpoint after an "
                              "Exception, cursor closing, or join() on this"
                              "thread.")
                self.checkpoint = last_ts
                self.update_checkpoint()

            logging.debug("OplogThread: Sleeping.  This batch I removed %d "
                          " documents and I upserted %d documents." %
                          (remove_inc, upsert_inc))
            time.sleep(2)
예제 #34
0
    def dump_collection(self):
        """Dumps collection into the target system.

        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        dump_set = self.get_dump_set()
        shard_doc_count = self.shard_doc_count()
        timestamp = None
        long_ts = 0

        LOG.info("OplogThread: Dumping set of collections %s from shard %s with %d docs" % (dump_set, self.shard_name, shard_doc_count))
        if shard_doc_count > 0:
            timestamp = util.retry_until_ok(self.get_last_oplog_timestamp)
            if timestamp:
                long_ts = util.bson_ts_to_long(timestamp)

        def query_handle_id(query):
            if '_id' in query and query['_id']:
                for key in query['_id']:
                    query['_id'][key] = ObjectId(query['_id'][key])

        def docs_to_dump(namespace):
            database, coll = namespace.split('.', 1)
            last_id = None
            attempts = 0

            # Loop to handle possible AutoReconnect
            while attempts < 60:
                target_coll = self.primary_client[database][coll]
                fields_to_fetch = None
                if 'include' in self._fields:
                    fields_to_fetch = self._fields['include']
                query = {}
                if self.initial_import['query']:
                    query_handle_id(self.initial_import['query'])
                    query = self.initial_import['query']
                if not last_id:
                    cursor = util.retry_until_ok(
                        target_coll.find,
                        query,
                        fields=fields_to_fetch,
                        sort=[("_id", pymongo.ASCENDING)]
                    )
                else:
                    cursor = util.retry_until_ok(
                        target_coll.find,
                        query.update({"_id": {"$gt": last_id}}),
                        fields=fields_to_fetch,
                        sort=[("_id", pymongo.ASCENDING)]
                    )
                try:
                    for doc in cursor:
                        if not self.running:
                            LOG.error("Stopped iterating over Cursor while initial import")
                            raise StopIteration
                        self.pop_excluded_fields(doc)
                        last_id = doc["_id"]
                        yield doc
                    break
                except (pymongo.errors.AutoReconnect,
                        pymongo.errors.OperationFailure):
                    attempts += 1
                    time.sleep(1)

        def upsert_all_failed_docs(dm, namespace, errors):
            if errors:
                for _id, field in errors:
                    self.upsert_doc(dm, namespace, long_ts, _id, field)

        def upsert_each(dm):
            num_inserted = 0
            num_failed = 0
            for namespace in dump_set:
                mapped_ns = self.dest_mapping.get(namespace, namespace)
                dm.disable_refresh(mapped_ns)
                for num, doc in enumerate(docs_to_dump(namespace)):
                    if num % 10000 == 0:
                        LOG.info("Upserted %d docs." % num)
                    try:
                        self.upsert_doc(dm, namespace, long_ts, None, None, doc)
                        num_inserted += 1
                    except Exception:
                        if self.continue_on_error:
                            LOG.exception("Could not upsert document: %r" % doc)
                            num_failed += 1
                        else:
                            raise
                dm.enable_refresh(mapped_ns)
            LOG.info("Upserted %d docs" % num_inserted)
            if num_failed > 0:
                LOG.error("Failed to upsert %d docs" % num_failed)

        def upsert_all(dm):
            try:
                for namespace in dump_set:
                    mapped_ns = self.dest_mapping.get(namespace, namespace)
                    dm.disable_refresh(mapped_ns)
                    errors = dm.bulk_upsert(docs_to_dump(namespace), mapped_ns, long_ts)
                    upsert_all_failed_docs(dm, namespace, errors)
                    dm.enable_refresh(mapped_ns)
            except Exception:
                if self.continue_on_error:
                    LOG.exception("OplogThread: caught exception"
                                  " during bulk upsert, re-upserting"
                                  " documents serially")
                    upsert_each(dm)
                else:
                    raise

        def do_dump(dm, error_queue):
            try:
                # Dump the documents, bulk upsert if possible
                if hasattr(dm, "bulk_upsert"):
                    LOG.debug("OplogThread: Using bulk upsert function for "
                              "collection dump")
                    upsert_all(dm)
                else:
                    LOG.debug(
                        "OplogThread: DocManager %s has no "
                        "bulk_upsert method.  Upserting documents "
                        "serially for collection dump." % str(dm))
                    upsert_each(dm)

                # Dump GridFS files
                for gridfs_ns in self.gridfs_set:
                    db, coll = gridfs_ns.split('.', 1)
                    mongo_coll = self.primary_client[db][coll]
                    dest_ns = self.dest_mapping.get(gridfs_ns, gridfs_ns)
                    for doc in docs_to_dump(gridfs_ns + '.files'):
                        gridfile = GridFSFile(mongo_coll, doc)
                        dm.insert_file(gridfile, dest_ns, long_ts)
            except:
                # Likely exceptions:
                # pymongo.errors.OperationFailure,
                # mongo_connector.errors.ConnectionFailed
                # mongo_connector.errors.OperationFailed
                error_queue.put(sys.exc_info())

        # Extra threads (if any) that assist with collection dumps
        dumping_threads = []
        # Did the dump succeed for all target systems?
        dump_success = True
        # Holds any exceptions we can't recover from
        errors = queue.Queue()

        if len(self.doc_managers) == 1:
            do_dump(self.doc_managers[0], errors)
        else:
            # Slight performance gain breaking dump into separate
            # threads if > 1 replication target
            for dm in self.doc_managers:
                t = threading.Thread(target=do_dump, args=(dm, errors))
                dumping_threads.append(t)
                t.start()
            # cleanup
            for t in dumping_threads:
                t.join()

        # Print caught exceptions
        try:
            while True:
                LOG.critical('Exception during collection dump',
                             exc_info=errors.get_nowait())
                dump_success = False
        except queue.Empty:
            pass

        if not dump_success:
            err_msg = "OplogThread: Failed during dump collection"
            effect = "cannot recover!"
            LOG.error('%s %s %s' % (err_msg, effect, self.oplog))
            self.running = False
            return None
        else:
            LOG.info('OplogThread: Successfully dumped collection')
            LOG.warning('Fields with errors: %r' % self.error_fields)
            if not self.reimporting:
                LOG.info("Waiting for other threads to finish collection dump")
                self.barrier.wait()

        return timestamp
예제 #35
0
    def init_cursor(self):
        """Position the cursor appropriately.

        The cursor is set to either the beginning of the oplog, or
        wherever it was last left off.

        Returns the cursor and the number of documents left in the cursor.
        """
        timestamp = self.read_last_checkpoint()

        LOG.info("Initializing cursor with initial timestamp: %r" % timestamp)

        dump_set = self.get_dump_set()

        for dm in self.doc_managers:
            for namespace in dump_set:
                mapped_ns = self.dest_mapping.get(namespace, namespace)
                if not dm.index_exists(mapped_ns):
                    dm.index_create(mapped_ns)

        if timestamp is None:
            if self.initial_import['dump']:
                # dump collection and update checkpoint
                LOG.info("INITIAL IMPORT: Starting initial import of data")
                timestamp = self.dump_collection()
                if timestamp is not None:
                    for dm in self.doc_managers:
                        for namespace in dump_set:
                            mapped_ns = self.dest_mapping.get(namespace, namespace)
                            dm.index_alias_add(mapped_ns, namespace)
            else:
                # Collection dump disabled:
                # return cursor to beginning of oplog.
                LOG.info("INITIAL IMPORT: Initial import skipped, creating oplog cursor")
                cursor = self.get_oplog_cursor()
                self.checkpoint = self.get_last_oplog_timestamp()
                self.update_checkpoint()
                return cursor
        else:
            LOG.info("Last checkpoint found from timestamp file, resuming oplog tailing from timestamp: %r" % timestamp)

        self.checkpoint = timestamp
        self.update_checkpoint()

        if timestamp is None:
            LOG.info("No documents found in shard: %s, creating cursor to tail oplog" % self.shard_name)
            return self.get_oplog_cursor()

        for i in range(60):
            cursor = self.get_oplog_cursor(timestamp)

            # try to get the first oplog entry
            try:
                first_oplog_entry = retry_until_ok(next, cursor)
            except StopIteration:
                # It's possible for the cursor to become invalid
                # between the cursor.count() call and now
                time.sleep(1)
                continue

            # first entry should be last oplog entry processed
            LOG.info("Retrieved first oplog entry: %r" % first_oplog_entry)
            cursor_ts_long = util.bson_ts_to_long(
                first_oplog_entry.get("ts"))
            given_ts_long = util.bson_ts_to_long(timestamp)
            if cursor_ts_long > given_ts_long:
                # first entry in oplog is beyond timestamp
                # we've fallen behind
                LOG.critical("Oplog Cursor has fallen behind, reimporting data")
                self.clear_checkpoint()
                self.reimporting = True
                self.initial_import['dump'] = True
                return None

            # first entry has been consumed
            LOG.info("Created oplog cursor")
            return cursor

        else:
            raise errors.MongoConnectorError(
                "Could not initialize oplog cursor.")
예제 #36
0
    def run(self):
        """Start the oplog worker.
        """
        LOG.debug("OplogThread: Run thread started")
        while self.running is True:
            LOG.debug("OplogThread: Getting cursor")
            cursor, cursor_len = self.init_cursor()

            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogThread: Last entry no longer in oplog"
                effect = "cannot recover!"
                LOG.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                continue

            if cursor_len == 0:
                LOG.debug("OplogThread: Last entry is the one we "
                          "already processed.  Up to date.  Sleeping.")
                time.sleep(1)
                continue

            LOG.debug("OplogThread: Got the cursor, count is %d"
                      % cursor_len)

            last_ts = None
            remove_inc = 0
            upsert_inc = 0
            update_inc = 0
            try:
                LOG.debug("OplogThread: about to process new oplog "
                          "entries")
                while cursor.alive and self.running:
                    LOG.debug("OplogThread: Cursor is still"
                              " alive and thread is still running.")
                    for n, entry in enumerate(cursor):

                        LOG.debug("OplogThread: Iterating through cursor,"
                                  " document number in this cursor is %d"
                                  % n)
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        # Don't replicate entries resulting from chunk moves
                        if entry.get("fromMigrate"):
                            continue

                        # Take fields out of the oplog entry that
                        # shouldn't be replicated. This may nullify
                        # the document if there's nothing to do.
                        if not self.filter_oplog_entry(entry):
                            continue

                        # Sync the current oplog operation
                        operation = entry['op']
                        ns = entry['ns']

                        if '.' not in ns:
                            continue
                        coll = ns.split('.', 1)[1]

                        # Ignore system collections
                        if coll.startswith("system."):
                            continue

                        # Ignore GridFS chunks
                        if coll.endswith('.chunks'):
                            continue

                        is_gridfs_file = False
                        if coll.endswith(".files"):
                            if ns in self.gridfs_files_set:
                                ns = ns[:-len(".files")]
                                is_gridfs_file = True
                            else:
                                continue

                        # use namespace mapping if one exists
                        ns = self.dest_mapping.get(ns, ns)
                        timestamp = util.bson_ts_to_long(entry['ts'])
                        for docman in self.doc_managers:
                            try:
                                LOG.debug("OplogThread: Operation for this "
                                          "entry is %s" % str(operation))

                                # Remove
                                if operation == 'd':
                                    docman.remove(
                                        entry['o']['_id'], ns, timestamp)
                                    remove_inc += 1

                                # Insert
                                elif operation == 'i':  # Insert
                                    # Retrieve inserted document from
                                    # 'o' field in oplog record
                                    doc = entry.get('o')
                                    # Extract timestamp and namespace
                                    if is_gridfs_file:
                                        db, coll = ns.split('.', 1)
                                        gridfile = GridFSFile(
                                            self.primary_client[db][coll],
                                            doc)
                                        docman.insert_file(
                                            gridfile, ns, timestamp)
                                    else:
                                        docman.upsert(doc, ns, timestamp)
                                    upsert_inc += 1

                                # Update
                                elif operation == 'u':
                                    docman.update(entry['o2']['_id'],
                                                  entry['o'],
                                                  ns, timestamp)
                                    update_inc += 1

                                # Command
                                elif operation == 'c':
                                    # use unmapped namespace
                                    doc = entry.get('o')
                                    docman.handle_command(doc,
                                                          entry['ns'],
                                                          timestamp)

                            except errors.OperationFailed:
                                LOG.exception(
                                    "Unable to process oplog document %r"
                                    % entry)
                            except errors.ConnectionFailed:
                                LOG.exception(
                                    "Connection failed while processing oplog "
                                    "document %r" % entry)

                        if (remove_inc + upsert_inc + update_inc) % 1000 == 0:
                            LOG.debug(
                                "OplogThread: Documents removed: %d, "
                                "inserted: %d, updated: %d so far" % (
                                    remove_inc, upsert_inc, update_inc))

                        LOG.debug("OplogThread: Doc is processed.")

                        last_ts = entry['ts']

                        # update timestamp per batch size
                        # n % -1 (default for self.batch_size) == 0 for all n
                        if n % self.batch_size == 1 and last_ts is not None:
                            self.checkpoint = last_ts
                            self.update_checkpoint()

                    # update timestamp after running through oplog
                    if last_ts is not None:
                        LOG.debug("OplogThread: updating checkpoint after"
                                  "processing new oplog entries")
                        self.checkpoint = last_ts
                        self.update_checkpoint()

            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure,
                    pymongo.errors.ConfigurationError):
                LOG.exception(
                    "Cursor closed due to an exception. "
                    "Will attempt to reconnect.")

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                LOG.debug("OplogThread: updating checkpoint after an "
                          "Exception, cursor closing, or join() on this"
                          "thread.")
                self.checkpoint = last_ts
                self.update_checkpoint()

            LOG.debug("OplogThread: Sleeping. Documents removed: %d, "
                      "upserted: %d, updated: %d"
                      % (remove_inc, upsert_inc, update_inc))
            time.sleep(2)
예제 #37
0
    def run(self):
        """Start the oplog worker.
        """
        while self.running is True:
            cursor = self.init_cursor()

            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogManager: Last entry no longer in oplog"
                effect = "cannot recover!"
                logging.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                continue

            #The only entry is the last one we processed
            if cursor is None or util.retry_until_ok(cursor.count) == 1:
                time.sleep(1)
                continue

            last_ts = None
            err = False
            try:
                while cursor.alive and self.running:
                    for entry in cursor:
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        #sync the current oplog operation
                        operation = entry['op']
                        ns = entry['ns']

                        #check if ns is excluded or not.
                        #also ensure non-empty namespace set.
                        if ns not in self.namespace_set and self.namespace_set:
                            continue

                        #delete
                        if operation == 'd':
                            entry['_id'] = entry['o']['_id']
                            self.doc_manager.remove(entry)
                        #insert/update. They are equal because of lack of support
                        #for partial update
                        elif operation == 'i' or operation == 'u':
                            doc = self.retrieve_doc(entry)
                            if doc is not None:
                                doc['_ts'] = util.bson_ts_to_long(entry['ts'])
                                doc['ns'] = ns
                                try:
                                    self.doc_manager.upsert(doc)
                                except errors.OperationFailed:
                                    logging.error("Unable to insert %s" % (doc))

                        last_ts = entry['ts']
            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure):
                err = True

            if err is True and self.auth_key is not None:
                self.primary_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                self.main_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                err = False

            if last_ts is not None:
                self.checkpoint = last_ts
                self.update_checkpoint()

            time.sleep(2)
예제 #38
0
    def init_cursor(self):
        """Position the cursor appropriately.

        The cursor is set to either the beginning of the oplog, or
        wherever it was last left off.

        Returns the cursor and the number of documents left in the cursor.
        """
        timestamp = self.read_last_checkpoint()

        if timestamp is None:
            if self.collection_dump:
                # dump collection and update checkpoint
                timestamp = self.dump_collection()
                if timestamp is None:
                    return None, 0
            else:
                # Collection dump disabled:
                # return cursor to beginning of oplog.
                cursor = self.get_oplog_cursor()
                self.checkpoint = self.get_last_oplog_timestamp()
                self.update_checkpoint()
                return cursor, retry_until_ok(cursor.count)

        self.checkpoint = timestamp
        self.update_checkpoint()

        for i in range(60):
            cursor = self.get_oplog_cursor(timestamp)
            cursor_len = retry_until_ok(cursor.count)

            if cursor_len == 0:
                # rollback, update checkpoint, and retry
                LOG.debug("OplogThread: Initiating rollback from "
                          "get_oplog_cursor")
                self.checkpoint = self.rollback()
                self.update_checkpoint()
                return self.init_cursor()

            # try to get the first oplog entry
            try:
                first_oplog_entry = retry_until_ok(next, cursor)
            except StopIteration:
                # It's possible for the cursor to become invalid
                # between the cursor.count() call and now
                time.sleep(1)
                continue

            # first entry should be last oplog entry processed
            cursor_ts_long = util.bson_ts_to_long(
                first_oplog_entry.get("ts"))
            given_ts_long = util.bson_ts_to_long(timestamp)
            if cursor_ts_long > given_ts_long:
                # first entry in oplog is beyond timestamp
                # we've fallen behind
                return None, 0

            # first entry has been consumed
            return cursor, cursor_len - 1

        else:
            raise errors.MongoConnectorError(
                "Could not initialize oplog cursor.")
예제 #39
0
    def test_rollback(self):
        """Test rollback in oplog_manager. Assertion failure if it doesn't pass
            We force a rollback by inserting a doc, killing primary, inserting
            another doc, killing the new primary, and then restarting both
            servers.
        """

        os.system('rm %s; touch %s' % (CONFIG, CONFIG))
        if not start_cluster(sharded=True):
            self.fail("Shards cannot be added to mongos")

        test_oplog, primary_conn, solr, mongos = self.get_new_oplog()

        solr = DocManager()
        test_oplog.doc_manager = solr
        solr._delete()  # equivalent to solr.delete(q='*:*')

        safe_mongo_op(mongos['alpha']['foo'].remove, {})
        safe_mongo_op(mongos['alpha']['foo'].insert, {
            '_id': ObjectId('4ff74db3f646462b38000001'),
            'name': 'paulie'
        })
        cutoff_ts = test_oplog.get_last_oplog_timestamp()

        obj2 = ObjectId('4ff74db3f646462b38000002')
        first_doc = {
            'name': 'paulie',
            '_ts': bson_ts_to_long(cutoff_ts),
            'ns': 'alpha.foo',
            '_id': ObjectId('4ff74db3f646462b38000001')
        }

        # try kill one, try restarting
        kill_mongo_proc(primary_conn.host, PORTS_ONE['PRIMARY'])

        new_primary_conn = Connection(HOSTNAME, int(PORTS_ONE['SECONDARY']))
        admin_db = new_primary_conn['admin']
        while admin_db.command("isMaster")['ismaster'] is False:
            time.sleep(1)
        time.sleep(5)
        count = 0
        while True:
            try:
                mongos['alpha']['foo'].insert({'_id': obj2, 'name': 'paul'})
                break
            except OperationFailure:
                time.sleep(1)
                count += 1
                if count > 60:
                    self.fail('Insert failed too many times in rollback')
                continue

        kill_mongo_proc(primary_conn.host, PORTS_ONE['SECONDARY'])

        start_mongo_proc(PORTS_ONE['PRIMARY'], "demo-repl", "/replset1a",
                         "/replset1a.log", None)

        # wait for master to be established
        while primary_conn['admin'].command("isMaster")['ismaster'] is False:
            time.sleep(1)

        start_mongo_proc(PORTS_ONE['SECONDARY'], "demo-repl", "/replset1b",
                         "/replset1b.log", None)

        # wait for secondary to be established
        admin_db = new_primary_conn['admin']
        while admin_db.command("replSetGetStatus")['myState'] != 2:
            time.sleep(1)

        while retry_until_ok(mongos['alpha']['foo'].find().count) != 1:
            time.sleep(1)

        self.assertEqual(str(new_primary_conn.port), PORTS_ONE['SECONDARY'])
        self.assertEqual(str(primary_conn.port), PORTS_ONE['PRIMARY'])

        last_ts = test_oplog.get_last_oplog_timestamp()
        second_doc = {
            'name': 'paul',
            '_ts': bson_ts_to_long(last_ts),
            'ns': 'alpha.foo',
            '_id': obj2
        }

        test_oplog.doc_manager.upsert(first_doc)
        test_oplog.doc_manager.upsert(second_doc)
        test_oplog.rollback()
        test_oplog.doc_manager.commit()
        results = solr._search()

        self.assertEqual(len(results), 1)

        results_doc = results[0]
        self.assertEqual(results_doc['name'], 'paulie')
        self.assertTrue(results_doc['_ts'] <= bson_ts_to_long(cutoff_ts))
예제 #40
0
    def run(self):
        """Start the oplog worker.
        """
        ReplicationLagLogger(self, 30).start()
        LOG.debug("OplogThread: Run thread started")
        while self.running is True:
            LOG.debug("OplogThread: Getting cursor")
            cursor, cursor_empty = retry_until_ok(self.init_cursor)
            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogThread: Last entry no longer in oplog"
                effect = "cannot recover!"
                LOG.error("%s %s %s" % (err_msg, effect, self.oplog))
                self.running = False
                continue

            if cursor_empty:
                LOG.debug(
                    "OplogThread: Last entry is the one we "
                    "already processed.  Up to date.  Sleeping."
                )
                time.sleep(1)
                continue

            last_ts = None
            try:
                LOG.debug("OplogThread: about to process new oplog entries")
                while cursor.alive and self.running:
                    LOG.debug(
                        "OplogThread: Cursor is still"
                        " alive and thread is still running."
                    )
                    for n, entry in enumerate(cursor):
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        LOG.debug(
                            "OplogThread: Iterating through cursor,"
                            " document number in this cursor is %d" % n
                        )

                        # Sync the current oplog operation
                        timestamp = util.bson_ts_to_long(entry["ts"])

                        ts_entries = self._get_transaction_entries(entry)
                        if ts_entries:
                            LOG.debug("OplogThread: transaction entries: %s" % ts_entries)
                            for ts_entry in ts_entries:
                                skip, is_gridfs_file = self._should_skip_entry(ts_entry)
                                if skip:
                                    continue

                                self._process_with_doc_managers(ts_entry, timestamp, is_gridfs_file)
                        else:
                            skip, is_gridfs_file = self._should_skip_entry(entry)
                            if skip:
                                # update the last_ts on skipped entries to ensure
                                # our checkpoint does not fall off the oplog. This
                                # also prevents reprocessing skipped entries.
                                last_ts = entry["ts"]
                                continue

                            self._process_with_doc_managers(entry, timestamp, is_gridfs_file)

                        if (self.remove_inc + self.upsert_inc + self.update_inc) % 1000 == 0:
                            LOG.debug(
                                "OplogThread: Documents removed: %d, "
                                "inserted: %d, updated: %d so far"
                                % (self.remove_inc, self.upsert_inc, self.update_inc)
                            )

                        LOG.debug("OplogThread: Doc is processed.")

                        last_ts = entry["ts"]

                        # update timestamp per batch size
                        # n % -1 (default for self.batch_size) == 0 for all n
                        if n % self.batch_size == 1:
                            self.update_checkpoint(last_ts)
                            last_ts = None

                    # update timestamp after running through oplog
                    if last_ts is not None:
                        LOG.debug(
                            "OplogThread: updating checkpoint after "
                            "processing new oplog entries"
                        )
                        self.update_checkpoint(last_ts)

            except (
                    pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure,
                    pymongo.errors.ConfigurationError,
            ):
                LOG.exception(
                    "Cursor closed due to an exception. " "Will attempt to reconnect."
                )

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                LOG.debug(
                    "OplogThread: updating checkpoint after an "
                    "Exception, cursor closing, or join() on this"
                    "thread."
                )
                self.update_checkpoint(last_ts)

            LOG.debug(
                "OplogThread: Sleeping. Documents removed: %d, "
                "upserted: %d, updated: %d" % (self.remove_inc, self.upsert_inc, self.update_inc)
            )
            time.sleep(2)
예제 #41
0
def write_ts_file(ts_file, rs, ts):
    ts_long = util.bson_ts_to_long(ts)
    content = json.dumps([rs, ts_long])
    open(ts_file, "w").write(content)
예제 #42
0
    def get_oplog_cursor(self, timestamp):
        """Move cursor to the proper place in the oplog.
        """

        logging.debug("OplogThread: Getting the oplog cursor and moving it "
                      "to the proper place in the oplog.")

        if timestamp is None:
            return None

        cursor, cursor_len = None, 0
        while (True):
            try:
                logging.debug("OplogThread: Getting the oplog cursor "
                              "in the while true loop for get_oplog_cursor")
                if not self.namespace_set:
                    cursor = self.oplog.find(
                        {'ts': {'$gte': timestamp}},
                        tailable=True, await_data=True
                    )
                else:
                    cursor = self.oplog.find(
                        {'ts': {'$gte': timestamp},
                         'ns': {'$in': self.namespace_set}},
                        tailable=True, await_data=True
                    )
                # Applying 8 as the mask to the cursor enables OplogReplay
                cursor.add_option(8)
                logging.debug("OplogThread: Cursor created, getting a count.")
                cursor_len = cursor.count()
                logging.debug("OplogThread: Count is %d" % cursor_len)
                break
            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure,
                    pymongo.errors.ConfigurationError):
                pass
        if cursor_len == 0:
            logging.debug("OplogThread: Initiating rollback from "
                          "get_oplog_cursor")
            #rollback, we are past the last element in the oplog
            timestamp = self.rollback()

            logging.info('Finished rollback')
            return self.get_oplog_cursor(timestamp)
        first_oplog_entry = retry_until_ok(lambda: cursor[0])
        cursor_ts_long = util.bson_ts_to_long(first_oplog_entry.get("ts"))
        given_ts_long = util.bson_ts_to_long(timestamp)
        if cursor_ts_long > given_ts_long:
            # first entry in oplog is beyond timestamp, we've fallen behind!
            return None
        elif cursor_len == 1:     # means we are the end of the oplog
            self.checkpoint = timestamp
            #to commit new TS after rollbacks

            return cursor
        elif cursor_len > 1:
            doc = retry_until_ok(next, cursor)
            if timestamp == doc['ts']:
                return cursor
            else:               # error condition
                logging.error('OplogThread: %s Bad timestamp in config file'
                              % self.oplog)
                return None
예제 #43
0
    def rollback(self):
        """Rollback target system to consistent state.

        The strategy is to find the latest timestamp in the target system and
        the largest timestamp in the oplog less than the latest target system
        timestamp. This defines the rollback window and we just roll these
        back until the oplog and target system are in consistent states.
        """
        # Find the most recently inserted document in each target system
        LOG.debug(
            "OplogThread: Initiating rollback sequence to bring "
            "system into a consistent state."
        )
        last_docs = []
        for dm in self.doc_managers:
            dm.commit()
            last_docs.append(dm.get_last_doc())

        # Of these documents, which is the most recent?
        last_inserted_doc = max(
            last_docs, key=lambda x: x["_ts"] if x else float("-inf")
        )

        # Nothing has been replicated. No need to rollback target systems
        if last_inserted_doc is None:
            return None

        # Find the oplog entry that touched the most recent document.
        # We'll use this to figure where to pick up the oplog later.
        target_ts = util.long_to_bson_ts(last_inserted_doc["_ts"])
        last_oplog_entry = util.retry_until_ok(
            self.oplog.find_one,
            {"ts": {"$lte": target_ts}, "op": {"$ne": "n"}},
            sort=[("$natural", pymongo.DESCENDING)],
        )

        LOG.debug("OplogThread: last oplog entry is %s" % str(last_oplog_entry))

        # The oplog entry for the most recent document doesn't exist anymore.
        # If we've fallen behind in the oplog, this will be caught later
        if last_oplog_entry is None:
            return None

        # rollback_cutoff_ts happened *before* the rollback
        rollback_cutoff_ts = last_oplog_entry["ts"]
        start_ts = util.bson_ts_to_long(rollback_cutoff_ts)
        # timestamp of the most recent document on any target system
        end_ts = last_inserted_doc["_ts"]

        for dm in self.doc_managers:
            rollback_set = {}  # this is a dictionary of ns:list of docs

            # group potentially conflicted documents by namespace
            for doc in dm.search(start_ts, end_ts):
                if doc["ns"] in rollback_set:
                    rollback_set[doc["ns"]].append(doc)
                else:
                    rollback_set[doc["ns"]] = [doc]

            # retrieve these documents from MongoDB, either updating
            # or removing them in each target system
            for namespace, doc_list in rollback_set.items():
                # Get the original namespace
                original_namespace = self.namespace_config.unmap_namespace(namespace)
                if not original_namespace:
                    original_namespace = namespace

                database, coll = original_namespace.split(".", 1)
                obj_id = bson.objectid.ObjectId
                bson_obj_id_list = [obj_id(doc["_id"]) for doc in doc_list]

                # Use connection to whole cluster if in sharded environment.
                client = self.mongos_client or self.primary_client
                to_update = util.retry_until_ok(
                    client[database][coll].find,
                    {"_id": {"$in": bson_obj_id_list}},
                    projection=self.namespace_config.projection(original_namespace),
                )
                # Doc list are docs in target system, to_update are
                # Docs in mongo
                doc_hash = {}  # Hash by _id
                for doc in doc_list:
                    doc_hash[bson.objectid.ObjectId(doc["_id"])] = doc

                to_index = []

                def collect_existing_docs():
                    for doc in to_update:
                        if doc["_id"] in doc_hash:
                            del doc_hash[doc["_id"]]
                            to_index.append(doc)

                retry_until_ok(collect_existing_docs)

                # Delete the inconsistent documents
                LOG.debug("OplogThread: Rollback, removing inconsistent " "docs.")
                remov_inc = 0
                for document_id in doc_hash:
                    try:
                        dm.remove(
                            document_id,
                            namespace,
                            util.bson_ts_to_long(rollback_cutoff_ts),
                        )
                        remov_inc += 1
                        LOG.debug("OplogThread: Rollback, removed %r " % doc)
                    except errors.OperationFailed:
                        LOG.warning(
                            "Could not delete document during rollback: %r "
                            "This can happen if this document was already "
                            "removed by another rollback happening at the "
                            "same time." % doc
                        )

                LOG.debug("OplogThread: Rollback, removed %d docs." % remov_inc)

                # Insert the ones from mongo
                LOG.debug("OplogThread: Rollback, inserting documents " "from mongo.")
                insert_inc = 0
                fail_insert_inc = 0
                for doc in to_index:
                    try:
                        insert_inc += 1
                        dm.upsert(
                            doc, namespace, util.bson_ts_to_long(rollback_cutoff_ts)
                        )
                    except errors.OperationFailed:
                        fail_insert_inc += 1
                        LOG.exception(
                            "OplogThread: Rollback, Unable to " "insert %r" % doc
                        )

        LOG.debug(
            "OplogThread: Rollback, Successfully inserted %d "
            " documents and failed to insert %d"
            " documents.  Returning a rollback cutoff time of %s "
            % (insert_inc, fail_insert_inc, str(rollback_cutoff_ts))
        )

        return rollback_cutoff_ts
예제 #44
0
    def rollback(self):
        """Rollback target system to consistent state.

        The strategy is to find the latest timestamp in the target system and
        the largest timestamp in the oplog less than the latest target system
        timestamp. This defines the rollback window and we just roll these
        back until the oplog and target system are in consistent states.
        """
        # Find the most recently inserted document in each target system
        logging.debug("OplogThread: Initiating rollback sequence to bring "
                      "system into a consistent state.")
        last_docs = []
        for dm in self.doc_managers:
            dm.commit()
            last_docs.append(dm.get_last_doc())

        # Of these documents, which is the most recent?
        last_inserted_doc = max(last_docs,
                                key=lambda x: x["_ts"] if x else float("-inf"))

        # Nothing has been replicated. No need to rollback target systems
        if last_inserted_doc is None:
            return None

        # Find the oplog entry that touched the most recent document.
        # We'll use this to figure where to pick up the oplog later.
        target_ts = util.long_to_bson_ts(last_inserted_doc['_ts'])
        last_oplog_entry = util.retry_until_ok(
            self.oplog.find_one,
            {'ts': {'$lte': target_ts}},
            sort=[('$natural', pymongo.DESCENDING)]
        )

        logging.debug("OplogThread: last oplog entry is %s"
                      % str(last_oplog_entry))

        # The oplog entry for the most recent document doesn't exist anymore.
        # If we've fallen behind in the oplog, this will be caught later
        if last_oplog_entry is None:
            return None

        # rollback_cutoff_ts happened *before* the rollback
        rollback_cutoff_ts = last_oplog_entry['ts']
        start_ts = util.bson_ts_to_long(rollback_cutoff_ts)
        # timestamp of the most recent document on any target system
        end_ts = last_inserted_doc['_ts']

        for dm in self.doc_managers:
            rollback_set = {}   # this is a dictionary of ns:list of docs

            # group potentially conflicted documents by namespace
            for doc in dm.search(start_ts, end_ts):
                if doc['ns'] in rollback_set:
                    rollback_set[doc['ns']].append(doc)
                else:
                    rollback_set[doc['ns']] = [doc]

            # retrieve these documents from MongoDB, either updating
            # or removing them in each target system
            for namespace, doc_list in rollback_set.items():
                # Get the original namespace
                original_namespace = namespace
                for source_name, dest_name in self.dest_mapping.items():
                    if dest_name == namespace:
                        original_namespace = source_name

                database, coll = original_namespace.split('.', 1)
                obj_id = bson.objectid.ObjectId
                bson_obj_id_list = [obj_id(doc['_id']) for doc in doc_list]

                to_update = util.retry_until_ok(
                    self.main_connection[database][coll].find,
                    {'_id': {'$in': bson_obj_id_list}},
                    fields=self._fields
                )
                #doc list are docs in target system, to_update are
                #docs in mongo
                doc_hash = {}  # hash by _id
                for doc in doc_list:
                    doc_hash[bson.objectid.ObjectId(doc['_id'])] = doc

                to_index = []

                def collect_existing_docs():
                    for doc in to_update:
                        if doc['_id'] in doc_hash:
                            del doc_hash[doc['_id']]
                            to_index.append(doc)
                retry_until_ok(collect_existing_docs)

                #delete the inconsistent documents
                logging.debug("OplogThread: Rollback, removing inconsistent "
                              "docs.")
                remov_inc = 0
                for doc in doc_hash.values():
                    try:
                        dm.remove(doc)
                        remov_inc += 1
                        logging.debug("OplogThread: Rollback, removed %s " %
                                      str(doc))
                    except errors.OperationFailed:
                        logging.warning(
                            "Could not delete document during rollback: %s "
                            "This can happen if this document was already "
                            "removed by another rollback happening at the "
                            "same time." % str(doc)
                        )

                logging.debug("OplogThread: Rollback, removed %d docs." %
                              remov_inc)

                #insert the ones from mongo
                logging.debug("OplogThread: Rollback, inserting documents "
                              "from mongo.")
                insert_inc = 0
                fail_insert_inc = 0
                for doc in to_index:
                    doc['_ts'] = util.bson_ts_to_long(rollback_cutoff_ts)
                    doc['ns'] = self.dest_mapping.get(namespace, namespace)
                    try:
                        insert_inc += 1
                        dm.upsert(doc)
                    except errors.OperationFailed as e:
                        fail_insert_inc += 1
                        logging.error("OplogThread: Rollback, Unable to "
                                      "insert %s with exception %s"
                                      % (doc, str(e)))

        logging.debug("OplogThread: Rollback, Successfully inserted %d "
                      " documents and failed to insert %d"
                      " documents.  Returning a rollback cutoff time of %s "
                      % (insert_inc, fail_insert_inc, str(rollback_cutoff_ts)))

        return rollback_cutoff_ts
    def test_rollback(self):
        """Test rollback in oplog_manager. Assertion failure if it doesn't pass
            We force a rollback by inserting a doc, killing primary, inserting
            another doc, killing the new primary, and then restarting both
            servers.
        """

        os.system('rm %s; touch %s' % (CONFIG, CONFIG))
        if not start_cluster(sharded=True):
            self.fail("Shards cannot be added to mongos")

        test_oplog, primary_conn, solr, mongos = self.get_new_oplog()

        solr = DocManager()
        test_oplog.doc_manager = solr
        solr._delete()          # equivalent to solr.delete(q='*:*')

        safe_mongo_op(mongos['alpha']['foo'].remove, {})
        safe_mongo_op(mongos['alpha']['foo'].insert,
                      {'_id': ObjectId('4ff74db3f646462b38000001'),
                      'name': 'paulie'})
        cutoff_ts = test_oplog.get_last_oplog_timestamp()

        obj2 = ObjectId('4ff74db3f646462b38000002')
        first_doc = {'name': 'paulie', '_ts': bson_ts_to_long(cutoff_ts),
                     'ns': 'alpha.foo', 
                     '_id': ObjectId('4ff74db3f646462b38000001')}

        # try kill one, try restarting
        kill_mongo_proc(primary_conn.host, PORTS_ONE['PRIMARY'])

        new_primary_conn = Connection(HOSTNAME, int(PORTS_ONE['SECONDARY']))
        admin_db = new_primary_conn['admin']
        while admin_db.command("isMaster")['ismaster'] is False:
            time.sleep(1)
        time.sleep(5)
        count = 0
        while True:
            try:
                mongos['alpha']['foo'].insert({'_id': obj2, 'name': 'paul'})
                break
            except OperationFailure:
                time.sleep(1)
                count += 1
                if count > 60:
                    self.fail('Insert failed too many times in rollback')
                continue

        kill_mongo_proc(primary_conn.host, PORTS_ONE['SECONDARY'])

        start_mongo_proc(PORTS_ONE['PRIMARY'], "demo-repl", "/replset1a",
                       "/replset1a.log", None)

        # wait for master to be established
        while primary_conn['admin'].command("isMaster")['ismaster'] is False:
            time.sleep(1)

        start_mongo_proc(PORTS_ONE['SECONDARY'], "demo-repl", "/replset1b",
                       "/replset1b.log", None)

        # wait for secondary to be established
        admin_db = new_primary_conn['admin']
        while admin_db.command("replSetGetStatus")['myState'] != 2:
            time.sleep(1)

        while retry_until_ok(mongos['alpha']['foo'].find().count) != 1:
            time.sleep(1)

        self.assertEqual(str(new_primary_conn.port), PORTS_ONE['SECONDARY'])
        self.assertEqual(str(primary_conn.port), PORTS_ONE['PRIMARY'])

        last_ts = test_oplog.get_last_oplog_timestamp()
        second_doc = {'name': 'paul', '_ts': bson_ts_to_long(last_ts),
                      'ns': 'alpha.foo', '_id': obj2}

        test_oplog.doc_manager.upsert(first_doc)
        test_oplog.doc_manager.upsert(second_doc)
        test_oplog.rollback()
        test_oplog.doc_manager.commit()
        results = solr._search()

        self.assertEqual(len(results), 1)

        results_doc = results[0]
        self.assertEqual(results_doc['name'], 'paulie')
        self.assertTrue(results_doc['_ts'] <= bson_ts_to_long(cutoff_ts))
예제 #46
0
    def dump_collection(self):
        """Dumps collection into the target system.

        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        timestamp = retry_until_ok(self.get_last_oplog_timestamp)
        if timestamp is None:
            return None
        long_ts = util.bson_ts_to_long(timestamp)
        # Flag if this oplog thread was cancelled during the collection dump.
        # Use a list to workaround python scoping.
        dump_cancelled = [False]

        def get_all_ns():
            ns_set = []
            gridfs_ns_set = []
            db_list = self.namespace_config.get_included_databases()
            if not db_list:
                # Only use listDatabases when the configured databases are not
                # explicit.
                db_list = retry_until_ok(self.primary_client.database_names)
            for database in db_list:
                if database == "config" or database == "local":
                    continue
                coll_list = retry_until_ok(
                    self.primary_client[database].collection_names
                )
                for coll in coll_list:
                    # ignore system collections
                    if coll.startswith("system."):
                        continue
                    # ignore gridfs chunks collections
                    if coll.endswith(".chunks"):
                        continue
                    if coll.endswith(".files"):
                        namespace = "%s.%s" % (database, coll)
                        namespace = namespace[: -len(".files")]
                        if self.namespace_config.gridfs_namespace(namespace):
                            gridfs_ns_set.append(namespace)
                    else:
                        namespace = "%s.%s" % (database, coll)
                        if self.namespace_config.map_namespace(namespace):
                            ns_set.append(namespace)
            return ns_set, gridfs_ns_set

        dump_set, gridfs_dump_set = get_all_ns()

        LOG.debug("OplogThread: Dumping set of collections %s " % dump_set)

        def docs_to_dump(from_coll):
            last_id = None
            attempts = 0
            projection = self.namespace_config.projection(from_coll.full_name)
            # Loop to handle possible AutoReconnect
            while attempts < 60:
                if last_id is None:
                    cursor = retry_until_ok(
                        from_coll.find,
                        projection=projection,
                        sort=[("_id", pymongo.ASCENDING)],
                    )
                else:
                    cursor = retry_until_ok(
                        from_coll.find,
                        {"_id": {"$gt": last_id}},
                        projection=projection,
                        sort=[("_id", pymongo.ASCENDING)],
                    )
                try:
                    for doc in cursor:
                        if not self.running:
                            # Thread was joined while performing the
                            # collection dump.
                            dump_cancelled[0] = True
                            raise StopIteration
                        last_id = doc["_id"]
                        yield doc
                    break
                except (pymongo.errors.AutoReconnect, pymongo.errors.OperationFailure):
                    attempts += 1
                    time.sleep(1)

        def upsert_each(dm):
            num_failed = 0
            for namespace in dump_set:
                from_coll = self.get_collection(namespace)
                mapped_ns = self.namespace_config.map_namespace(namespace)
                total_docs = retry_until_ok(from_coll.count)
                num = None
                for num, doc in enumerate(docs_to_dump(from_coll)):
                    try:
                        dm.upsert(doc, mapped_ns, long_ts)
                    except Exception:
                        if self.continue_on_error:
                            LOG.exception("Could not upsert document: %r" % doc)
                            num_failed += 1
                        else:
                            raise
                    if num % 10000 == 0:
                        LOG.info(
                            "Upserted %d out of approximately %d docs "
                            "from collection '%s'",
                            num + 1,
                            total_docs,
                            namespace,
                        )
                if num is not None:
                    LOG.info(
                        "Upserted %d out of approximately %d docs from "
                        "collection '%s'",
                        num + 1,
                        total_docs,
                        namespace,
                    )
            if num_failed > 0:
                LOG.error("Failed to upsert %d docs" % num_failed)

        def upsert_all(dm):
            try:
                for namespace in dump_set:
                    from_coll = self.get_collection(namespace)
                    total_docs = retry_until_ok(from_coll.count)
                    mapped_ns = self.namespace_config.map_namespace(namespace)
                    LOG.info(
                        "Bulk upserting approximately %d docs from " "collection '%s'",
                        total_docs,
                        namespace,
                    )
                    dm.bulk_upsert(docs_to_dump(from_coll), mapped_ns, long_ts)
            except Exception:
                if self.continue_on_error:
                    LOG.exception(
                        "OplogThread: caught exception"
                        " during bulk upsert, re-upserting"
                        " documents serially"
                    )
                    upsert_each(dm)
                else:
                    raise

        def do_dump(dm, error_queue):
            try:
                LOG.debug(
                    "OplogThread: Using bulk upsert function for " "collection dump"
                )
                upsert_all(dm)

                if gridfs_dump_set:
                    LOG.info(
                        "OplogThread: dumping GridFS collections: %s", gridfs_dump_set
                    )

                # Dump GridFS files
                for gridfs_ns in gridfs_dump_set:
                    mongo_coll = self.get_collection(gridfs_ns)
                    from_coll = self.get_collection(gridfs_ns + ".files")
                    dest_ns = self.namespace_config.map_namespace(gridfs_ns)
                    for doc in docs_to_dump(from_coll):
                        gridfile = GridFSFile(mongo_coll, doc)
                        dm.insert_file(gridfile, dest_ns, long_ts)
            except Exception:
                # Likely exceptions:
                # pymongo.errors.OperationFailure,
                # mongo_connector.errors.ConnectionFailed
                # mongo_connector.errors.OperationFailed
                error_queue.put(sys.exc_info())

        # Extra threads (if any) that assist with collection dumps
        dumping_threads = []
        # Did the dump succeed for all target systems?
        dump_success = True
        # Holds any exceptions we can't recover from
        errors = queue.Queue()

        if len(self.doc_managers) == 1:
            do_dump(self.doc_managers[0], errors)
        else:
            # Slight performance gain breaking dump into separate
            # threads if > 1 replication target
            for dm in self.doc_managers:
                t = threading.Thread(target=do_dump, args=(dm, errors))
                dumping_threads.append(t)
                t.start()
            # cleanup
            for t in dumping_threads:
                t.join()

        # Print caught exceptions
        try:
            while True:
                LOG.critical(
                    "Exception during collection dump", exc_info=errors.get_nowait()
                )
                dump_success = False
        except queue.Empty:
            pass

        if not dump_success:
            err_msg = "OplogThread: Failed during dump collection"
            effect = "cannot recover!"
            LOG.error("%s %s %s" % (err_msg, effect, self.oplog))
            self.running = False
            return None

        if dump_cancelled[0]:
            LOG.warning(
                "Initial collection dump was interrupted. "
                "Will re-run the collection dump on next startup."
            )
            return None

        return timestamp
예제 #47
0
#!/usr/bin/python
import pymongo
import sys
from mongo_connector import util

mongo_url = "mongodb://localhost:27017"
if len(sys.argv) == 1:
    print("First argument is mongodb connection string, i.e. "
          "localhost:27017. Assuming localhost:27017...")
if len(sys.argv) >= 2:
    mongo_url = sys.argv[1]

client = pymongo.MongoClient(mongo_url)
rs_name = client.admin.command("ismaster")["setName"]
print("Found Replica Set name: {}".format(str(rs_name)))

print("Now checking for the latest oplog entry...")
oplog = client.local.oplog.rs
last_oplog = oplog.find().sort("$natural", pymongo.DESCENDING).limit(-1).next()
print("Found the last oplog ts: {}".format(str(last_oplog["ts"])))
last_ts = util.bson_ts_to_long(last_oplog["ts"])
out_str = '["{}", {}]'.format(str(rs_name), str(last_ts))

print("Writing all to file oplog.timestamp.last in the format "
      "required for mongo-connector")
f = open("./oplog.timestamp.last", "w")
f.write(out_str)
f.close()
print("All done!")
예제 #48
0
    def dump_collection(self):
        """Dumps collection into the target system.

        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        dump_set = self.get_dump_set()
        shard_doc_count = self.shard_doc_count()
        timestamp = None

        LOG.info("OplogThread: Dumping set of collections %s from shard %s with %d docs" % (dump_set, self.shard_name, shard_doc_count))
        if shard_doc_count > 0:
            timestamp = util.retry_until_ok(self.get_last_oplog_timestamp)
            long_ts = util.bson_ts_to_long(timestamp)
        else:
            long_ts = 0

        def query_handle_id(query):
            if '_id' in query and query['_id']:
                for key in query['_id']:
                    query['_id'][key] = ObjectId(query['_id'][key])

        def docs_to_dump(namespace):
            database, coll = namespace.split('.', 1)
            last_id = None
            attempts = 0

            # Loop to handle possible AutoReconnect
            while attempts < 60:
                target_coll = self.primary_client[database][coll]
                fields_to_fetch = None
                if 'include' in self._fields:
                    fields_to_fetch = self._fields['include']
                query = {}
                if self.initial_import['query']:
                    query_handle_id(self.initial_import['query'])
                    query = self.initial_import['query']
                if not last_id:
                    cursor = util.retry_until_ok(
                        target_coll.find,
                        query,
                        fields=fields_to_fetch,
                        sort=[("_id", pymongo.ASCENDING)]
                    )
                else:
                    cursor = util.retry_until_ok(
                        target_coll.find,
                        query.update({"_id": {"$gt": last_id}}),
                        fields=fields_to_fetch,
                        sort=[("_id", pymongo.ASCENDING)]
                    )
                try:
                    for doc in cursor:
                        if not self.running:
                            LOG.error("Stopped iterating over Cursor while initial import")
                            raise StopIteration
                        self.pop_excluded_fields(doc)
                        last_id = doc["_id"]
                        yield doc
                    break
                except (pymongo.errors.AutoReconnect,
                        pymongo.errors.OperationFailure):
                    attempts += 1
                    time.sleep(1)

        # Extra threads (if any) that assist with collection dumps
        dumping_threads = []
        # Did the dump succeed for all target systems?
        dump_success = True
        # Holds any exceptions we can't recover from
        errors = queue.Queue()

        if len(self.doc_managers) == 1:
            do_dump(self.doc_managers[0], errors)
        else:
            # Slight performance gain breaking dump into separate
            # threads if > 1 replication target
            for dm in self.doc_managers:
                t = threading.Thread(target=do_dump, args=(dm, errors))
                dumping_threads.append(t)
                t.start()
            # cleanup
            for t in dumping_threads:
                t.join()

        # Print caught exceptions
        try:
            while True:
                LOG.critical('Exception during collection dump',
                             exc_info=errors.get_nowait())
                dump_success = False
        except queue.Empty:
            pass

        if not dump_success:
            err_msg = "OplogThread: Failed during dump collection"
            effect = "cannot recover!"
            LOG.error('%s %s %s' % (err_msg, effect, self.oplog))
            self.running = False
            return None
        else:
            LOG.info('OplogThread: Successfully dumped collection')
            LOG.warning('Fields with errors: %r' % self.error_fields)
            if not self.reimporting:
                LOG.info("Waiting for other threads to finish collection dump")
                self.barrier.wait()

        return timestamp
예제 #49
0
    def run(self):
        """Start the oplog worker.
        """
        while self.running is True:
            cursor = self.init_cursor()

            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogManager: Last entry no longer in oplog"
                effect = "cannot recover!"
                logging.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                continue

            #The only entry is the last one we processed
            if cursor is None or util.retry_until_ok(cursor.count) == 1:
                time.sleep(1)
                continue

            last_ts = None
            err = False
            try:
                while cursor.alive and self.running:
                    for n, entry in enumerate(cursor):
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        #sync the current oplog operation
                        operation = entry['op']
                        ns = entry['ns']

                        #check if ns is excluded or not.
                        #also ensure non-empty namespace set.
                        if (ns not in self.namespace_set
                                and self.namespace_set):
                            continue

                        # use namespace mapping if one exists
                        ns = self.dest_mapping.get(entry['ns'], ns)

                        #delete
                        try:
                            if operation == 'd':
                                entry['_id'] = entry['o']['_id']
                                for dm in self.doc_managers:
                                    dm.remove(entry)
                            #insert/update. They are equal because of lack
                            #of support for partial update
                            elif operation == 'i' or operation == 'u':
                                doc = self.retrieve_doc(entry)
                                if doc is not None:
                                    doc['_ts'] = util.bson_ts_to_long(
                                        entry['ts'])
                                    doc['ns'] = ns
                                    for dm in self.doc_managers:
                                        dm.upsert(self.filter_fields(doc))
                        except errors.OperationFailed:
                            logging.error(
                                "Unable to %s doc with id %s" % (
                                    "delete" if operation == "d" else "upsert",
                                    str(entry['_id'])
                                ))
                        except errors.ConnectionFailed:
                            logging.error(
                                "Network error while trying to %s %s" % (
                                    "delete" if operation == "d" else "upsert",
                                    str(entry['_id'])
                                ))

                        last_ts = entry['ts']

                        # update timestamp per batch size
                        # n % -1 (default for self.batch_size) == 0 for all n
                        if n % self.batch_size == 1 and last_ts is not None:
                            self.checkpoint = last_ts
                            self.update_checkpoint()

                    # update timestamp after running through oplog
                    if last_ts is not None:
                        self.checkpoint = last_ts
                        self.update_checkpoint()

            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure):
                err = True

            if err is True and self.auth_key is not None:
                self.primary_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                self.main_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                err = False

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                self.checkpoint = last_ts
                self.update_checkpoint()

            time.sleep(2)