def do_dump(dm, error_queue):
    """Dump documents and GridFS files into one doc manager.

    Calls ``upsert_all(dm)`` first, then walks ``gridfs_dump_set`` and
    hands every file document to ``dm.insert_file``.  ``self``,
    ``gridfs_dump_set``, ``long_ts``, ``upsert_all`` and ``docs_to_dump``
    are not defined in this function; they are resolved from the
    surrounding scope.

    Args:
        dm: Doc manager receiving the dump.
        error_queue: Queue that receives ``sys.exc_info()`` when any
            ``Exception`` is raised during the dump.
    """
    try:
        LOG.debug(
            "OplogThread: Using bulk upsert function for collection dump"
        )
        upsert_all(dm)

        if gridfs_dump_set:
            LOG.info(
                "OplogThread: dumping GridFS collections: %s", gridfs_dump_set
            )

        # GridFS pass: one insert_file call per document yielded for the
        # namespace's ".files" collection.
        for source_ns in gridfs_dump_set:
            base_coll = self.get_collection(source_ns)
            files_coll = self.get_collection(source_ns + ".files")
            target_ns = self.namespace_config.map_namespace(source_ns)
            for file_doc in docs_to_dump(files_coll):
                dm.insert_file(
                    GridFSFile(base_coll, file_doc), target_ns, long_ts
                )
    except Exception:
        # Likely exceptions:
        # pymongo.errors.OperationFailure,
        # mongo_connector.errors.ConnectionFailed
        # mongo_connector.errors.OperationFailed
        error_queue.put(sys.exc_info())
def do_dump(dm, error_queue):
    """Dump documents and GridFS files into one doc manager.

    Uses ``upsert_all(dm)`` when the doc manager exposes a
    ``bulk_upsert`` attribute and ``upsert_each(dm)`` otherwise, then
    inserts every GridFS file document for the namespaces in
    ``self.gridfs_set``.  ``self``, ``long_ts``, ``upsert_all``,
    ``upsert_each`` and ``docs_to_dump`` are resolved from the
    surrounding scope.

    Args:
        dm: Doc manager receiving the dump.
        error_queue: Queue that receives ``sys.exc_info()`` when the
            dump raises an ``Exception``.
    """
    try:
        # Dump the documents, bulk upsert if possible
        if hasattr(dm, "bulk_upsert"):
            LOG.debug("OplogThread: Using bulk upsert function for "
                      "collection dump")
            upsert_all(dm)
        else:
            # Logger-side formatting: the message is only built when the
            # debug level is enabled.
            LOG.debug(
                "OplogThread: DocManager %s has no "
                "bulk_upsert method. Upserting documents "
                "serially for collection dump.", dm)
            upsert_each(dm)

        # Dump GridFS files
        for gridfs_ns in self.gridfs_set:
            db, coll = gridfs_ns.split('.', 1)
            mongo_coll = self.primary_client[db][coll]
            dest_ns = self.dest_mapping.get(gridfs_ns, gridfs_ns)
            for doc in docs_to_dump(gridfs_ns + '.files'):
                gridfile = GridFSFile(mongo_coll, doc)
                dm.insert_file(gridfile, dest_ns, long_ts)
    except Exception:
        # Catch Exception rather than everything so that
        # KeyboardInterrupt / SystemExit / GeneratorExit keep propagating
        # instead of being queued as dump errors.
        # Likely exceptions:
        # pymongo.errors.OperationFailure,
        # mongo_connector.errors.ConnectionFailed
        # mongo_connector.errors.OperationFailed
        error_queue.put(sys.exc_info())
def _process_with_doc_managers(self, entry, timestamp, is_gridfs_file):
    """Apply a single oplog entry to every doc manager.

    The entry's ``"op"`` field selects the call made on each doc
    manager: ``"d"`` -> ``remove``, ``"i"`` -> ``upsert`` (or
    ``insert_file`` for GridFS files), ``"u"`` -> ``update``,
    ``"c"`` -> ``handle_command``.  Any other value is ignored.
    ``self.remove_inc``, ``self.upsert_inc`` and ``self.update_inc``
    are incremented once per doc manager after a successful call.

    ``errors.OperationFailed`` and ``errors.ConnectionFailed`` are
    logged and do not stop the remaining doc managers.

    Args:
        entry: Oplog entry; must contain ``"op"`` and ``"ns"``.
        timestamp: Timestamp value forwarded to the doc manager calls.
        is_gridfs_file: When true, inserts go through ``insert_file``.
    """
    op_code = entry["op"]
    namespace = entry["ns"]
    for manager in self.doc_managers:
        try:
            LOG.debug(
                "OplogThread: Operation for this entry is %s" % str(op_code)
            )

            if op_code == "d":
                # Delete: only the _id of the removed document is needed.
                manager.remove(entry["o"]["_id"], namespace, timestamp)
                self.remove_inc += 1

            elif op_code == "i":
                # Insert: the new document is stored under 'o'.
                inserted = entry.get("o")
                if not is_gridfs_file:
                    manager.upsert(inserted, namespace, timestamp)
                else:
                    db_name, coll_name = namespace.split(".", 1)
                    manager.insert_file(
                        GridFSFile(
                            self.primary_client[db_name][coll_name], inserted
                        ),
                        namespace,
                        timestamp,
                    )
                self.upsert_inc += 1

            elif op_code == "u":
                # Update: 'o2' identifies the document, 'o' is the spec.
                manager.update(
                    entry["o2"]["_id"], entry["o"], namespace, timestamp
                )
                self.update_inc += 1

            elif op_code == "c":
                # Command: forwarded with the entry's own namespace.
                manager.handle_command(entry.get("o"), entry["ns"], timestamp)

        except errors.OperationFailed:
            LOG.exception("Unable to process oplog document %r" % entry)
        except errors.ConnectionFailed:
            LOG.exception(
                "Connection failed while processing oplog document %r" % entry
            )
def do_dump(dm, error_queue):
    """Dump documents and GridFS files into one doc manager.

    Calls ``upsert_all(dm)``, then inserts every GridFS file document
    for the namespaces in ``self.gridfs_set``.  ``self``, ``long_ts``,
    ``upsert_all`` and ``docs_to_dump`` are resolved from the
    surrounding scope.

    Args:
        dm: Doc manager receiving the dump.
        error_queue: Queue that receives ``sys.exc_info()`` when the
            dump raises an ``Exception``.
    """
    try:
        LOG.debug("OplogThread: Using bulk upsert function for "
                  "collection dump")
        upsert_all(dm)

        # Dump GridFS files
        for gridfs_ns in self.gridfs_set:
            mongo_coll = self.get_collection(gridfs_ns)
            from_coll = self.get_collection(gridfs_ns + '.files')
            # Fall back to the source namespace when no mapping exists.
            dest_ns = self.dest_mapping_stru.get(gridfs_ns, gridfs_ns)
            for doc in docs_to_dump(from_coll):
                gridfile = GridFSFile(mongo_coll, doc)
                dm.insert_file(gridfile, dest_ns, long_ts)
    except Exception:
        # Catch Exception rather than everything so that
        # KeyboardInterrupt / SystemExit / GeneratorExit keep propagating
        # instead of being queued as dump errors.
        # Likely exceptions:
        # pymongo.errors.OperationFailure,
        # mongo_connector.errors.ConnectionFailed
        # mongo_connector.errors.OperationFailed
        error_queue.put(sys.exc_info())
def run(self):
    """Start the oplog worker.

    Loops while ``self.running`` is true.  Each pass obtains a cursor
    through ``retry_until_ok(self.init_cursor)``, applies every
    non-skipped entry to each doc manager in ``self.doc_managers`` and
    records progress with ``self.update_checkpoint``.  The loop ends by
    setting ``self.running`` to False when no cursor is returned while a
    checkpoint already exists.
    """
    ReplicationLagLogger(self, 30).start()
    LOG.debug("OplogThread: Run thread started")
    while self.running is True:
        LOG.debug("OplogThread: Getting cursor")
        cursor, cursor_empty = retry_until_ok(self.init_cursor)
        # we've fallen too far behind
        if cursor is None and self.checkpoint is not None:
            err_msg = "OplogThread: Last entry no longer in oplog"
            effect = "cannot recover!"
            LOG.error("%s %s %s" % (err_msg, effect, self.oplog))
            self.running = False
            continue

        if cursor_empty:
            LOG.debug(
                "OplogThread: Last entry is the one we "
                "already processed. Up to date. Sleeping."
            )
            time.sleep(1)
            continue

        # Per-pass state: the newest timestamp not yet checkpointed and
        # the operation counters used only for logging.
        last_ts = None
        remove_inc = 0
        upsert_inc = 0
        update_inc = 0
        try:
            LOG.debug("OplogThread: about to process new oplog entries")
            while cursor.alive and self.running:
                LOG.debug(
                    "OplogThread: Cursor is still"
                    " alive and thread is still running."
                )
                for n, entry in enumerate(cursor):
                    # Break out if this thread should stop
                    if not self.running:
                        break

                    LOG.debug(
                        "OplogThread: Iterating through cursor,"
                        " document number in this cursor is %d" % n
                    )

                    skip, is_gridfs_file = self._should_skip_entry(entry)
                    if skip:
                        # update the last_ts on skipped entries to ensure
                        # our checkpoint does not fall off the oplog. This
                        # also prevents reprocessing skipped entries.
                        last_ts = entry["ts"]
                        continue

                    # Sync the current oplog operation
                    operation = entry["op"]
                    ns = entry["ns"]
                    timestamp = util.bson_ts_to_long(entry["ts"])
                    for docman in self.doc_managers:
                        try:
                            LOG.debug(
                                "OplogThread: Operation for this "
                                "entry is %s" % str(operation)
                            )

                            # Remove
                            if operation == "d":
                                docman.remove(entry["o"]["_id"], ns, timestamp)
                                remove_inc += 1

                            # Insert
                            elif operation == "i":  # Insert
                                # Retrieve inserted document from
                                # 'o' field in oplog record
                                doc = entry.get("o")
                                # Extract timestamp and namespace
                                if is_gridfs_file:
                                    db, coll = ns.split(".", 1)
                                    gridfile = GridFSFile(
                                        self.primary_client[db][coll], doc
                                    )
                                    docman.insert_file(gridfile, ns, timestamp)
                                else:
                                    docman.upsert(doc, ns, timestamp)
                                upsert_inc += 1

                            # Update
                            elif operation == "u":
                                docman.update(
                                    entry["o2"]["_id"], entry["o"], ns, timestamp
                                )
                                update_inc += 1

                            # Command
                            elif operation == "c":
                                # use unmapped namespace
                                doc = entry.get("o")
                                docman.handle_command(doc, entry["ns"], timestamp)

                        # A failure in one doc manager is logged and the
                        # remaining doc managers still see the entry.
                        except errors.OperationFailed:
                            LOG.exception(
                                "Unable to process oplog document %r" % entry
                            )
                        except errors.ConnectionFailed:
                            LOG.exception(
                                "Connection failed while processing oplog "
                                "document %r" % entry
                            )

                    # Also true while every counter is still zero
                    # (0 % 1000 == 0).
                    if (remove_inc + upsert_inc + update_inc) % 1000 == 0:
                        LOG.debug(
                            "OplogThread: Documents removed: %d, "
                            "inserted: %d, updated: %d so far"
                            % (remove_inc, upsert_inc, update_inc)
                        )

                    LOG.debug("OplogThread: Doc is processed.")

                    last_ts = entry["ts"]

                    # update timestamp per batch size
                    # n % -1 (default for self.batch_size) == 0 for all n
                    if n % self.batch_size == 1:
                        self.update_checkpoint(last_ts)
                        # Cleared so the checks below do not write the
                        # same timestamp a second time.
                        last_ts = None

                # update timestamp after running through oplog
                if last_ts is not None:
                    LOG.debug(
                        "OplogThread: updating checkpoint after "
                        "processing new oplog entries"
                    )
                    self.update_checkpoint(last_ts)

        except (
            pymongo.errors.AutoReconnect,
            pymongo.errors.OperationFailure,
            pymongo.errors.ConfigurationError,
        ):
            LOG.exception(
                "Cursor closed due to an exception. " "Will attempt to reconnect."
            )

        # update timestamp before attempting to reconnect to MongoDB,
        # after being join()'ed, or if the cursor closes
        if last_ts is not None:
            # NOTE(review): the last two literals join without a space,
            # so the logged text reads "...join() on thisthread.".
            LOG.debug(
                "OplogThread: updating checkpoint after an "
                "Exception, cursor closing, or join() on this"
                "thread."
            )
            self.update_checkpoint(last_ts)

        LOG.debug(
            "OplogThread: Sleeping. Documents removed: %d, "
            "upserted: %d, updated: %d" % (remove_inc, upsert_inc, update_inc)
        )
        time.sleep(2)
def get_file(self, doc):
    """Wrap *doc* in a ``GridFSFile`` bound to ``self.collection``.

    Args:
        doc: Document passed through to ``GridFSFile`` unchanged.

    Returns:
        The ``GridFSFile`` built from ``self.collection`` and *doc*.
    """
    wrapped = GridFSFile(self.collection, doc)
    return wrapped
def run(self):
    """Start the oplog worker.

    Loops while ``self.running`` is true.  Each pass obtains a cursor
    from ``self.init_cursor()``, filters the entries (chunk migrations,
    entries rejected by ``self.filter_oplog_entry``, namespaces without
    a collection part, ``system.*`` collections, GridFS ``.chunks`` and
    unregistered ``.files`` collections), applies the rest to every doc
    manager in ``self.doc_managers`` and stores progress in
    ``self.checkpoint`` followed by ``self.update_checkpoint()``.
    """
    LOG.debug("OplogThread: Run thread started")
    while self.running is True:
        LOG.debug("OplogThread: Getting cursor")
        cursor, cursor_len = self.init_cursor()

        # we've fallen too far behind
        if cursor is None and self.checkpoint is not None:
            err_msg = "OplogThread: Last entry no longer in oplog"
            effect = "cannot recover!"
            LOG.error('%s %s %s' % (err_msg, effect, self.oplog))
            self.running = False
            continue

        if cursor_len == 0:
            LOG.debug("OplogThread: Last entry is the one we "
                      "already processed. Up to date. Sleeping.")
            time.sleep(1)
            continue

        LOG.debug("OplogThread: Got the cursor, count is %d"
                  % cursor_len)

        # Per-pass state: the newest processed timestamp and the
        # operation counters used only for logging.
        last_ts = None
        remove_inc = 0
        upsert_inc = 0
        update_inc = 0
        try:
            LOG.debug("OplogThread: about to process new oplog "
                      "entries")
            while cursor.alive and self.running:
                LOG.debug("OplogThread: Cursor is still"
                          " alive and thread is still running.")
                for n, entry in enumerate(cursor):

                    LOG.debug("OplogThread: Iterating through cursor,"
                              " document number in this cursor is %d"
                              % n)
                    # Break out if this thread should stop
                    if not self.running:
                        break

                    # Don't replicate entries resulting from chunk moves
                    if entry.get("fromMigrate"):
                        continue

                    # Take fields out of the oplog entry that
                    # shouldn't be replicated. This may nullify
                    # the document if there's nothing to do.
                    if not self.filter_oplog_entry(entry):
                        continue

                    # Sync the current oplog operation
                    operation = entry['op']
                    ns = entry['ns']

                    # Namespaces without a '.' have no collection part.
                    if '.' not in ns:
                        continue
                    coll = ns.split('.', 1)[1]

                    # Ignore system collections
                    if coll.startswith("system."):
                        continue

                    # Ignore GridFS chunks
                    if coll.endswith('.chunks'):
                        continue

                    is_gridfs_file = False
                    if coll.endswith(".files"):
                        if ns in self.gridfs_files_set:
                            # Strip the ".files" suffix from the
                            # namespace used for the rest of this entry.
                            ns = ns[:-len(".files")]
                            is_gridfs_file = True
                        else:
                            continue

                    # use namespace mapping if one exists
                    ns = self.dest_mapping.get(ns, ns)
                    timestamp = util.bson_ts_to_long(entry['ts'])
                    for docman in self.doc_managers:
                        try:
                            LOG.debug("OplogThread: Operation for this "
                                      "entry is %s" % str(operation))

                            # Remove
                            if operation == 'd':
                                docman.remove(
                                    entry['o']['_id'], ns, timestamp)
                                remove_inc += 1
                            # Insert
                            elif operation == 'i':  # Insert
                                # Retrieve inserted document from
                                # 'o' field in oplog record
                                doc = entry.get('o')
                                # Extract timestamp and namespace
                                if is_gridfs_file:
                                    # NOTE(review): ns has already been
                                    # passed through dest_mapping here,
                                    # so the collection is looked up on
                                    # primary_client by the mapped name
                                    # -- confirm this is intended.
                                    db, coll = ns.split('.', 1)
                                    gridfile = GridFSFile(
                                        self.primary_client[db][coll],
                                        doc)
                                    docman.insert_file(
                                        gridfile, ns, timestamp)
                                else:
                                    docman.upsert(doc, ns, timestamp)
                                upsert_inc += 1
                            # Update
                            elif operation == 'u':
                                docman.update(entry['o2']['_id'],
                                              entry['o'],
                                              ns, timestamp)
                                update_inc += 1
                            # Command
                            elif operation == 'c':
                                # use unmapped namespace
                                doc = entry.get('o')
                                docman.handle_command(doc,
                                                      entry['ns'],
                                                      timestamp)

                        # A failure in one doc manager is logged and the
                        # remaining doc managers still see the entry.
                        except errors.OperationFailed:
                            LOG.exception(
                                "Unable to process oplog document %r"
                                % entry)
                        except errors.ConnectionFailed:
                            LOG.exception(
                                "Connection failed while processing oplog "
                                "document %r" % entry)

                    # Also true while every counter is still zero
                    # (0 % 1000 == 0).
                    if (remove_inc + upsert_inc + update_inc) % 1000 == 0:
                        LOG.debug(
                            "OplogThread: Documents removed: %d, "
                            "inserted: %d, updated: %d so far" % (
                                remove_inc, upsert_inc, update_inc))

                    LOG.debug("OplogThread: Doc is processed.")

                    last_ts = entry['ts']

                    # update timestamp per batch size
                    # n % -1 (default for self.batch_size) == 0 for all n
                    if n % self.batch_size == 1 and last_ts is not None:
                        self.checkpoint = last_ts
                        self.update_checkpoint()

                # update timestamp after running through oplog
                if last_ts is not None:
                    # NOTE(review): the two literals join without a
                    # space, so the logged text reads
                    # "...checkpoint afterprocessing new oplog entries".
                    LOG.debug("OplogThread: updating checkpoint after"
                              "processing new oplog entries")
                    self.checkpoint = last_ts
                    self.update_checkpoint()

        except (pymongo.errors.AutoReconnect,
                pymongo.errors.OperationFailure,
                pymongo.errors.ConfigurationError):
            LOG.exception(
                "Cursor closed due to an exception. "
                "Will attempt to reconnect.")

        # update timestamp before attempting to reconnect to MongoDB,
        # after being join()'ed, or if the cursor closes
        if last_ts is not None:
            # NOTE(review): the last two literals join without a space,
            # so the logged text reads "...join() on thisthread.".
            LOG.debug("OplogThread: updating checkpoint after an "
                      "Exception, cursor closing, or join() on this"
                      "thread.")
            self.checkpoint = last_ts
            self.update_checkpoint()

        LOG.debug("OplogThread: Sleeping. Documents removed: %d, "
                  "upserted: %d, updated: %d"
                  % (remove_inc, upsert_inc, update_inc))
        time.sleep(2)
def run(self):
    """Start the oplog worker.

    Loops while ``self.running`` is true.  Each pass obtains a cursor
    through ``retry_until_ok(self.init_cursor)``, applies every
    non-skipped entry to each doc manager in ``self.doc_managers``,
    reports per-entry operation counts through
    ``self.doc_operation_count`` and caught errors through
    ``self.error_caught``, and records progress with
    ``self.update_checkpoint``.
    """
    ReplicationLagLogger(self, 30).start()
    LOG.debug("OplogThread: Run thread started")
    while self.running is True:
        LOG.debug("OplogThread: Getting cursor")
        cursor, cursor_empty = retry_until_ok(self.init_cursor)

        # we've fallen too far behind
        if cursor is None and self.checkpoint is not None:
            err_msg = "OplogThread: Last entry no longer in oplog"
            effect = "cannot recover!"
            LOG.error('%s %s %s' % (err_msg, effect, self.oplog))
            self.running = False
            continue

        if cursor_empty:
            LOG.debug("OplogThread: Last entry is the one we "
                      "already processed. Up to date. Sleeping.")
            time.sleep(1)
            continue

        # Per-pass state: the newest timestamp not yet checkpointed and
        # the cumulative counters used only for logging.
        last_ts = None
        remove_inc = 0
        upsert_inc = 0
        update_inc = 0
        try:
            LOG.debug("OplogThread: about to process new oplog entries")
            while cursor.alive and self.running:
                LOG.debug("OplogThread: Cursor is still"
                          " alive and thread is still running.")
                for n, entry in enumerate(cursor):
                    # Break out if this thread should stop
                    if not self.running:
                        break

                    LOG.debug("OplogThread: Iterating through cursor,"
                              " document number in this cursor is %d"
                              % n)

                    skip, is_gridfs_file = self._should_skip_entry(entry)
                    if skip:
                        # update the last_ts on skipped entries to ensure
                        # our checkpoint does not fall off the oplog. This
                        # also prevents reprocessing skipped entries.
                        last_ts = entry['ts']
                        continue

                    # Per-entry counters, reset for every oplog entry and
                    # reported through process_request below.
                    op_add = 0
                    op_remove = 0
                    op_update = 0

                    # Sync the current oplog operation
                    operation = entry['op']
                    ns = entry['ns']
                    timestamp = util.bson_ts_to_long(entry['ts'])
                    for docman in self.doc_managers:
                        # Redefined for every doc manager; increments
                        # the metric object it is given.
                        @self.ERROR_TIME.time()
                        def process_exception(metric):
                            metric.inc()
                        try:
                            LOG.debug("OplogThread: Operation for this "
                                      "entry is %s" % str(operation))

                            # Remove
                            if operation == 'd':
                                docman.remove(
                                    entry['o']['_id'], ns, timestamp)
                                remove_inc += 1
                                op_remove += 1
                            # Insert
                            elif operation == 'i':  # Insert
                                # Retrieve inserted document from
                                # 'o' field in oplog record
                                doc = entry.get('o')
                                # Extract timestamp and namespace
                                if is_gridfs_file:
                                    db, coll = ns.split('.', 1)
                                    gridfile = GridFSFile(
                                        self.primary_client[db][coll],
                                        doc)
                                    docman.insert_file(
                                        gridfile, ns, timestamp)
                                else:
                                    docman.upsert(doc, ns, timestamp)
                                upsert_inc += 1
                                op_add += 1
                            # Update
                            elif operation == 'u':
                                docman.update(entry['o2']['_id'],
                                              entry['o'],
                                              ns, timestamp)
                                update_inc += 1
                                op_update += 1
                            # Command
                            elif operation == 'c':
                                # use unmapped namespace
                                doc = entry.get('o')
                                docman.handle_command(
                                    doc, entry['ns'], timestamp)

                        # NOTE(review): the per-entry counters are only
                        # incremented after a successful doc manager
                        # call, so the decrements in the handlers below
                        # can only undo an increment made for an earlier
                        # doc manager on this same entry -- confirm that
                        # is the intended accounting.
                        except errors.OperationFailed:
                            # Remove
                            if operation == 'd':
                                if op_remove > 0:
                                    op_remove -= 1
                            # Insert
                            elif operation == 'i':
                                if op_add > 0:
                                    op_add -= 1
                            # Update
                            elif operation == 'u':
                                if op_update > 0:
                                    op_update -= 1
                            process_exception(
                                self.error_caught.labels(
                                    'cannot_process_doc',
                                    errors.OperationFailed))
                            LOG.exception(
                                "Unable to process oplog document %r"
                                % entry)
                        except errors.ConnectionFailed:
                            # Remove
                            if operation == 'd':
                                if op_remove > 0:
                                    op_remove -= 1
                            # Insert
                            elif operation == 'i':
                                if op_add > 0:
                                    op_add -= 1
                            # Update
                            elif operation == 'u':
                                if op_update > 0:
                                    op_update -= 1
                            process_exception(
                                self.error_caught.labels(
                                    'connection_failed',
                                    errors.ConnectionFailed))
                            LOG.exception(
                                "Connection failed while processing oplog "
                                "document %r" % entry)

                    # Also true while every counter is still zero
                    # (0 % 1000 == 0).
                    if (remove_inc + upsert_inc + update_inc) % 1000 == 0:
                        LOG.debug("OplogThread: Documents removed: %d, "
                                  "inserted: %d, updated: %d so far"
                                  % (remove_inc, upsert_inc, update_inc))

                    LOG.debug("OplogThread: Doc is processed.")

                    last_ts = entry['ts']

                    # update timestamp per batch size
                    # n % -1 (default for self.batch_size) == 0 for all n
                    if n % self.batch_size == 1:
                        self.update_checkpoint(last_ts)
                        # Cleared so the checks below do not write the
                        # same timestamp a second time.
                        last_ts = None

                    LOG.always("Counter: Documents removed: %d, "
                               "inserted: %d, updated: %d so far"
                               % (op_remove, op_add, op_update))

                    # TODO: Add collection name as label
                    # Redefined for every entry; adds the per-entry
                    # counts to the 'add' / 'remove' / 'update' labels.
                    @self.REQUEST_TIME.time()
                    def process_request(add, remove, update):
                        self.doc_operation_count.labels('add').inc(add)
                        self.doc_operation_count.labels('remove').inc(
                            remove)
                        self.doc_operation_count.labels('update').inc(
                            update)

                    process_request(op_add, op_remove, op_update)

                # update timestamp after running through oplog
                if last_ts is not None:
                    LOG.debug("OplogThread: updating checkpoint after "
                              "processing new oplog entries")
                    self.update_checkpoint(last_ts)

        except (pymongo.errors.AutoReconnect,
                pymongo.errors.OperationFailure,
                pymongo.errors.ConfigurationError):
            LOG.exception("Cursor closed due to an exception. "
                          "Will attempt to reconnect.")

        # update timestamp before attempting to reconnect to MongoDB,
        # after being join()'ed, or if the cursor closes
        if last_ts is not None:
            # NOTE(review): the last two literals join without a space,
            # so the logged text reads "...join() on thisthread.".
            LOG.debug("OplogThread: updating checkpoint after an "
                      "Exception, cursor closing, or join() on this"
                      "thread.")
            self.update_checkpoint(last_ts)

        LOG.debug("OplogThread: Sleeping. Documents removed: %d, "
                  "upserted: %d, updated: %d"
                  % (remove_inc, upsert_inc, update_inc))
        time.sleep(2)