def _generate_workers(self, files, state, start_utc_secs, end_utc_secs):
    """Generate the threads that tail the data sources and put the fetched
    entries to the files"""
    # Create worker threads to track/dump mongodb activities
    workers_info = []
    doc_queue = Queue.Queue()

    # Writer thread, we only have one writer since we assume all files will
    # be written to the same device (disk or SSD), as a result it yields
    # not much benefit to have multiple writers.
    workers_info.append({
        "name": "write-all-docs-to-file",
        "thread": Thread(
            target=MongoQueryRecorder._process_doc_queue,
            args=(doc_queue, files, state))
    })

    tailer = utils.get_oplog_tailer(self.oplog_client,
                                    # we are only interested in "insert"
                                    ["i"],
                                    self.config["target_databases"],
                                    self.config["target_collections"],
                                    Timestamp(start_utc_secs, 0))
    oplog_cursor_id = tailer.cursor_id
    workers_info.append({
        "name": "tailing-oplogs",
        "on_close":
            lambda: self.oplog_client.kill_cursors([oplog_cursor_id]),
        "thread": Thread(
            target=tail_to_queue,
            args=(tailer, "oplog", doc_queue, state,
                  Timestamp(end_utc_secs, 0)))
    })

    start_datetime = datetime.utcfromtimestamp(start_utc_secs)
    end_datetime = datetime.utcfromtimestamp(end_utc_secs)
    for profiler_name, client in self.profiler_clients.items():
        # create a profile collection tailer for each db
        for db in self.config["target_databases"]:
            tailer = utils.get_profiler_tailer(client,
                                               db,
                                               self.config["target_collections"],
                                               start_datetime)
            tailer_id = "%s_%s" % (db, profiler_name)
            profiler_cursor_id = tailer.cursor_id
            workers_info.append({
                "name": "tailing-profiler for %s on %s" % (db, profiler_name),
                "on_close":
                    lambda: self.profiler_client.kill_cursors(
                        [profiler_cursor_id]),
                "thread": Thread(
                    target=tail_to_queue,
                    args=(tailer, tailer_id, doc_queue, state, end_datetime))
            })

    for worker_info in workers_info:
        utils.LOG.info("Starting thread: %s", worker_info["name"])
        worker_info["thread"].setDaemon(True)
        worker_info["thread"].start()

    return workers_info
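The writer thread above targets MongoQueryRecorder._process_doc_queue, whose body is not shown in this section. Below is a minimal standalone sketch of what such a drain-and-dump loop could look like; the (source_id, doc) item shape, the state.timeout stop flag, and pickling one entry per source file are assumptions made for illustration, not the recorder's actual implementation.

import Queue   # Python 2 module, matching the Queue.Queue() usage above
import pickle


def process_doc_queue(doc_queue, files, state):
    """Drain the shared queue and append each fetched entry to its file."""
    while not state.timeout:  # assumed stop flag on the shared `state` object
        try:
            # Block briefly so the loop can re-check the stop flag regularly.
            source_id, doc = doc_queue.get(block=True, timeout=1)
        except Queue.Empty:
            continue
        # Assumed format: one pickled doc per entry, one file handle per source.
        pickle.dump(doc, files[source_id])
        doc_queue.task_done()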
def main():
    db_config = config.DB_CONFIG
    duration_in_sec = int(sys.argv[1])
    output = open(sys.argv[2], "w")

    # Get the tailer for oplog
    mongodb_uri = db_config["oplog_server"]["mongodb_uri"]
    oplog_client = MongoClient(mongodb_uri)
    tailer = utils.get_oplog_tailer(oplog_client,
                                    ["i", "u"],
                                    db_config["target_database"],
                                    db_config["target_collections"])
    # sanitize
    write_to_file(tailer, duration_in_sec, output)
def main():
    db_config = config.DB_CONFIG
    duration_in_sec = int(sys.argv[1])
    output = open(sys.argv[2], "w")

    # Get the tailer for oplog
    oplog_server = db_config["oplog_server"]
    oplog_client = MongoClient(oplog_server["host"], oplog_server["port"])
    tailer = utils.get_oplog_tailer(oplog_client,
                                    ["i", "u"],
                                    db_config["target_database"],
                                    db_config["target_collections"])
    # sanitize
    write_to_file(tailer, duration_in_sec, output)
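Both main() variants hand the tailer off to write_to_file, which is not defined in this section. A rough sketch under stated assumptions follows: the cursor is consumed until duration_in_sec elapses, each doc is pickled to the output file, and the loop backs off briefly whenever the tailable cursor has no new entries. The serialization format and the polling interval are assumptions, not the tool's actual implementation.

import time
import pickle


def write_to_file(tailer, duration_in_sec, output):
    """Consume a tailing cursor for a fixed duration and dump each doc."""
    deadline = time.time() + duration_in_sec
    while tailer.alive and time.time() < deadline:
        try:
            doc = tailer.next()
            pickle.dump(doc, output)  # assumed serialization format
        except StopIteration:
            # The tailable cursor has no new data yet; poll again shortly.
            time.sleep(0.1)
    output.close()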
def _generate_workers(self, files, state, start_utc_secs, end_utc_secs):
    """Generate the threads that tail the data sources and put the fetched
    entries to the files"""
    # Initialize a thread-safe queue that we'll put the docs into
    doc_queue = Queue.Queue()

    # Initialize a list that will keep track of all the worker threads that
    # handle tracking/dumping of mongodb activities
    workers_info = []

    # Writer thread, we only have one writer since we assume all files will
    # be written to the same device (disk or SSD), as a result it yields
    # not much benefit to have multiple writers.
    workers_info.append({
        "name": WRITER_THREAD_NAME,
        "thread": Thread(target=MongoQueryRecorder._process_doc_queue,
                         args=(doc_queue, files, state))
    })

    # For each server in the "oplog_servers" config...
    for server_string, mongo_client in self.oplog_clients.items():
        # Create a tailing cursor (aka a tailer) on an oplog collection.
        # "i" stands for the only op type we care about, which is an insert
        tailer = utils.get_oplog_tailer(mongo_client,
                                        ["i"],
                                        self.config["target_databases"],
                                        self.config["target_collections"],
                                        Timestamp(start_utc_secs, 0))
        # Create a new thread and add some metadata to it
        workers_info.append({
            "name": "tailing-oplogs on %s" % server_string,
            "on_close":
                lambda: self.oplog_client.kill_cursors([tailer.cursor_id]),
            "thread": Thread(target=tail_to_queue,
                             args=(tailer, "oplog", doc_queue, state,
                                   Timestamp(end_utc_secs, 0)))
        })

    start_datetime = datetime.utcfromtimestamp(start_utc_secs)
    end_datetime = datetime.utcfromtimestamp(end_utc_secs)

    # For each server in the "profiler_servers" config...
    for server_string, mongo_client in self.profiler_clients.items():
        # For each database in the "target_databases" config...
        for db in self.config["target_databases"]:
            # Create a tailing cursor (aka a tailer) on a profile collection
            tailer = utils.get_profiler_tailer(
                mongo_client, db, self.config["target_collections"],
                start_datetime)
            # Create a new thread and add some metadata to it
            tailer_id = "%s_%s" % (db, server_string)
            workers_info.append({
                "name": "tailing-profiler for %s on %s" % (db, server_string),
                "on_close":
                    lambda: self.profiler_client.kill_cursors(
                        [tailer.cursor_id]),
                "thread": Thread(target=tail_to_queue,
                                 args=(tailer, tailer_id, doc_queue, state,
                                       end_datetime))
            })

    # Daemonize each thread and start it
    for worker_info in workers_info:
        utils.LOG.info("Starting thread: %s", worker_info["name"])
        worker_info["thread"].setDaemon(True)
        worker_info["thread"].start()

    # Return the list of all the started threads
    return workers_info
def _generate_workers(self, files, state, start_utc_secs, end_utc_secs):
    """Generate the threads that tail the data sources and put the fetched
    entries to the files"""
    # Initialize a thread-safe queue that we'll put the docs into
    doc_queue = Queue.Queue()

    # Initialize a list that will keep track of all the worker threads that
    # handle tracking/dumping of mongodb activities
    workers_info = []

    # Writer thread, we only have one writer since we assume all files will
    # be written to the same device (disk or SSD), as a result it yields
    # not much benefit to have multiple writers.
    workers_info.append({
        "name": "write-all-docs-to-file",
        "thread": Thread(
            target=MongoQueryRecorder._process_doc_queue,
            args=(doc_queue, files, state))
    })

    # For each server in the "oplog_servers" config...
    for server_string, mongo_client in self.oplog_clients.items():
        # Create a tailing cursor (aka a tailer) on an oplog collection.
        # "i" stands for the only op type we care about, which is an insert
        tailer = utils.get_oplog_tailer(mongo_client,
                                        ["i"],
                                        self.config["target_databases"],
                                        self.config["target_collections"],
                                        Timestamp(start_utc_secs, 0))
        # Create a new thread and add some metadata to it
        workers_info.append({
            "name": "tailing-oplogs on %s" % server_string,
            "on_close":
                lambda: self.oplog_client.kill_cursors([tailer.cursor_id]),
            "thread": Thread(
                target=tail_to_queue,
                args=(tailer, "oplog", doc_queue, state,
                      Timestamp(end_utc_secs, 0)))
        })

    start_datetime = datetime.utcfromtimestamp(start_utc_secs)
    end_datetime = datetime.utcfromtimestamp(end_utc_secs)

    # For each server in the "profiler_servers" config...
    for server_string, mongo_client in self.profiler_clients.items():
        # For each database in the "target_databases" config...
        for db in self.config["target_databases"]:
            # Create a tailing cursor (aka a tailer) on a profile collection
            tailer = utils.get_profiler_tailer(mongo_client,
                                               db,
                                               self.config["target_collections"],
                                               start_datetime)
            # Create a new thread and add some metadata to it
            tailer_id = "%s_%s" % (db, server_string)
            workers_info.append({
                "name": "tailing-profiler for %s on %s" % (db, server_string),
                "on_close":
                    lambda: self.profiler_client.kill_cursors(
                        [tailer.cursor_id]),
                "thread": Thread(
                    target=tail_to_queue,
                    args=(tailer, tailer_id, doc_queue, state, end_datetime))
            })

    # Daemonize each thread and start it
    for worker_info in workers_info:
        utils.LOG.info("Starting thread: %s", worker_info["name"])
        worker_info["thread"].setDaemon(True)
        worker_info["thread"].start()

    # Return the list of all the started threads
    return workers_info
def _generate_workers(self, files, state, start_utc_secs, end_utc_secs):
    """Generate the threads that tail the data sources and put the fetched
    entries to the files"""
    # Create worker threads to track/dump mongodb activities
    workers_info = []
    doc_queue = Queue.Queue()

    # Writer thread, we only have one writer since we assume all files will
    # be written to the same device (disk or SSD), as a result it yields
    # not much benefit to have multiple writers.
    workers_info.append({
        "name": "write-all-docs-to-file",
        "thread": Thread(target=MongoQueryRecorder._process_doc_queue,
                         args=(doc_queue, files, state))
    })

    tailer = utils.get_oplog_tailer(
        self.oplog_client,
        # we are only interested in "insert"
        ["i"],
        self.config["target_databases"],
        self.config["target_collections"],
        Timestamp(start_utc_secs, 0))
    oplog_cursor_id = tailer.cursor_id
    workers_info.append({
        "name": "tailing-oplogs",
        "on_close":
            lambda: self.oplog_client.kill_cursors([oplog_cursor_id]),
        "thread": Thread(target=tail_to_queue,
                         args=(tailer, "oplog", doc_queue, state,
                               Timestamp(end_utc_secs, 0)))
    })

    start_datetime = datetime.utcfromtimestamp(start_utc_secs)
    end_datetime = datetime.utcfromtimestamp(end_utc_secs)
    for profiler_name, client in self.profiler_clients.items():
        # create a profile collection tailer for each db
        for db in self.config["target_databases"]:
            tailer = utils.get_profiler_tailer(
                client, db, self.config["target_collections"], start_datetime)
            tailer_id = "%s_%s" % (db, profiler_name)
            profiler_cursor_id = tailer.cursor_id
            workers_info.append({
                "name": "tailing-profiler for %s on %s" % (db, profiler_name),
                "on_close":
                    lambda: self.profiler_client.kill_cursors(
                        [profiler_cursor_id]),
                "thread": Thread(target=tail_to_queue,
                                 args=(tailer, tailer_id, doc_queue, state,
                                       end_datetime))
            })

    for worker_info in workers_info:
        utils.LOG.info("Starting thread: %s", worker_info["name"])
        worker_info["thread"].setDaemon(True)
        worker_info["thread"].start()

    return workers_info
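Every tailing worker above runs tail_to_queue, which is referenced but never defined in this section. A simplified sketch, assuming the fetched docs carry a "ts" field comparable to the end boundary (a Timestamp for the oplog, a datetime for the profile collection) and that the shared state object exposes a timeout flag; neither assumption is confirmed by the code above.

import time


def tail_to_queue(tailer, identifier, doc_queue, state, end_time):
    """Push docs from a tailing cursor onto the shared queue until end_time."""
    while tailer.alive and not state.timeout:  # assumed stop flag on `state`
        try:
            doc = tailer.next()
            if doc["ts"] >= end_time:  # assumed timestamp field name
                break
            doc_queue.put((identifier, doc))
        except StopIteration:
            # No new entries yet on the tailable cursor; wait and retry.
            time.sleep(1)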