def __init__(self, version, pid, debug=False, debug_threshold=255, local_transfer=True):
    """Archive file and tar management

    :type version: int
    :type pid: basestring
    :type local_transfer: bool
    :type debug_threshold: int
    :type debug: bool
    """
    self.pid = pid
    self.debug = Debug(self.pid, debug=debug, debug_threshold=debug_threshold)
    self.version = version
    #self.transfer = LocalTransfer() if local_transfer else Transfer()
    self.transfer = LocalTransfer() if local_transfer else None

    ## check both queue dirs; the original discarded the first status
    shm_status, self.archive_copy_dir = self.ensure_dir('/papertape/shm/%s/' % self.pid)
    queue_status, self.queue_dir = self.ensure_dir('/papertape/queue/%s/' % self.pid)
    if shm_status is not True or queue_status is not True:
        self.debug.output('data dir init failed')
        raise Exception('data dir init failed')

    self.catalog_name = "{0:s}/paper.{1:s}.file_list".format(self.queue_dir, self.pid)
    self.tape_ids_filename = "{0:s}/paper.{1:s}.tape_ids.file_list".format(self.queue_dir, self.pid)

    self.archive_list = []  ## working file_list of files to write
    self.tape_list = []     ## cumulative file_list of written files
    self.item_index = 0     ## file path index (human-readable line numbers in catalog)
    self.archive_state = 0  ## current archive state
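
## ---------------------------------------------------------------------------
## Hypothetical sketch, not part of the original module: the Archive
## initializer above unpacks a (status, path) pair from self.ensure_dir(),
## whose body is not shown in this section. Assuming it simply creates the
## directory when missing, a compatible helper could look like this:
import os

def _ensure_dir_sketch(dir_path):
    """create dir_path if missing; return (True, dir_path) on success"""
    try:
        os.makedirs(dir_path, exist_ok=True)  ## no error if the dir already exists
        return True, dir_path
    except OSError:
        return False, dir_path
## ---------------------------------------------------------------------------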
class FastTar(RamTar):
    """handling python tarfile opened directly against tape devices"""

    def __init__(self, pid, drive_select=1, rewrite_path=None, debug=False, debug_threshold=128):
        """initialize"""
        self.pid = pid
        self.debug = Debug(self.pid, debug=debug, debug_threshold=debug_threshold)
        self.drive_select = drive_select
        self.rewrite_path = rewrite_path

        ## if we're not using disk queuing we open the drives differently;
        ## we need to track different states
        ## for faster archiving we keep some data in memory instead of queuing to disk
        self.archive_tar = ''

        ## tape opened with tar
        ## this is a dictionary where we will do:
        ## self.tape_drive[drive_int] = tarfile.open(mode='w:')
        self.tape_filehandle = {}
        self.tape_drive = {}

        ## if we use tarfile, we need to track the state
        self.drive_states = RamTarStateCode
        self.drive_state = self.ramtar_tape_drive(drive_select, self.drive_states.drive_init)

    def send_archive_to_tape(self, drive_int, archive_list, archive_name, archive_file):
        """send the current archive to tape"""
        try:
            self.debug.output('{}'.format(archive_name))
            self.ramtar_tape_drive(drive_int, self.drive_states.drive_open)
            self.debug.output('{}'.format(self.drive_state))

            ## add archive_list
            self.tape_drive[drive_int].add(archive_list)

            ## write the tape
            #self.tape_drive[drive_int].add(archive_file)
            self.ramtar_tape_drive(drive_int, self.drive_states.drive_close)

            ## truncate the current archive to save disk space
            archive_open = open(archive_file, 'w')
            archive_open.truncate(0)
        except Exception as cept:
            self.debug.output('tarfile - {}'.format(cept))
            raise
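
## ---------------------------------------------------------------------------
## Hypothetical usage sketch, not part of the original module. FastTar writes
## a queued archive straight to a tape device by opening tarfile against
## /dev/nst<N>; the pid and queue paths below are invented for illustration:
def _fast_tar_usage_sketch():
    """illustrative only; requires a loaded tape in /dev/nst0"""
    ram = FastTar('000001001', drive_select=0, debug=True)
    ## open /dev/nst0, tar the catalog file_list onto the tape, close the
    ## drive, and truncate the on-disk archive to reclaim space
    ram.send_archive_to_tape(
        drive_int=0,
        archive_list='/papertape/queue/000001001/paper.000001001.0.file_list',
        archive_name='paper.000001001.0.tar',
        archive_file='/papertape/queue/000001001/paper.000001001.0.tar')
## ---------------------------------------------------------------------------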
class Dump(object):
    """Coordinate a dump to tape based on deletable files in database"""

    def __init__(self, credentials, debug=False, pid=None, disk_queue=True,
                 drive_select=2, debug_threshold=255):
        """initialize"""
        self.version = __version__
        self.pid = "%0.6d%0.3d" % (getpid(), randint(1, 999)) if pid is None else pid
        self.debug = Debug(self.pid, debug=debug, debug_threshold=debug_threshold)
        self.status_code = StatusCode

        self.mtx_creds = '~/.my.mtx.cnf'
        self.debug.output(credentials)
        self.paper_creds = credentials

        self.tape_ids = ''

        ## each dump process writes 6gb to /dev/shm (two batches at a time)
        self.batch_size_mb = 12000

        ## (1.5Tb - 1 batch)
        self.tape_size = (1.5 * 1000 * 1000) - self.batch_size_mb
        #self.tape_size = 13000

        ## setup PaperDB connection
        #self.paperdb = PaperDB(self.version, self.paper_creds, self.pid, debug=True,
        #                       debug_threshold=debug_threshold)
        ## test database
        self.paperdb = TestPaperDB(self.version, self.paper_creds, self.pid, debug=True,
                                   debug_threshold=debug_threshold)
        ## reload test data
        #self.paperdb.load_sample_data()

        ## setup tape library
        self.labeldb = MtxDB(self.version, self.mtx_creds, self.pid, debug=debug,
                             debug_threshold=debug_threshold)

        ## setup file access
        self.files = Archive(self.version, self.pid, debug=debug, debug_threshold=debug_threshold)

        ## use the pid here to lock changer
        self.drive_select = drive_select
        self.tape = Changer(self.version, self.pid, self.tape_size, debug=True,
                            drive_select=drive_select, disk_queue=disk_queue,
                            debug_threshold=debug_threshold)

        self.dump_list = []
        self.tape_index = 0
        self.tape_used_size = 0

        ## each dump process should write one tape worth of data
        self.dump_state_code = DumpStateCode
        self.dump_state = self.dump_state_code.initialize

    def archive_to_tape(self):
        """master method to loop through files to write data to tape"""
        ## get a file_list of files, transfer to disk, write to tape
        while self.tape_used_size + self.batch_size_mb < self.tape_size:
            ## get a file_list of files to dump
            archive_list, archive_size = self.get_list(self.batch_size_mb)

            if archive_list:
                try:
                    ## copy files to b5, gen catalog file
                    self.files.build_archive(archive_list)

                    ## tar files to disk with catalog
                    self.files.queue_archive(self.tape_index, archive_list)

                    ## mark where we are
                    self.dump_state = self.dump_state_code.dump_queue
                except Exception as error:
                    self.debug.output('archive build/queue error {}'.format(error))
                    self.close_dump()

                ## Files in these lists should be identical, but
                ## self.files.archive_list carries extra data:
                ##   self.files.archive_list: [[0, 1, 'test:/testdata/testdir'], [0, 2, 'test:/testdata/testdir2'], ... ]
                ##   archive_list:            ['test:/testdata/testdir', 'test:/testdata/testdir2', ... ]
                self.debug.output('archive_list - %s' % self.files.archive_list)
                self.debug.output('file_list - %s' % archive_list)

                ## queue_archive does the job of making the archive_list;
                ## we need to update the tape_list
                self.files.tape_list.extend(self.files.archive_list)
                self.debug.output("q:%s l:%s t:%s" % (self.tape_used_size, archive_size, self.tape_size))

                ## add archive_size to current tape_used_size
                self.tape_used_size += archive_size
                self.tape_index += 1
            else:
                ## we ran out of files
                self.debug.output('file_list empty')
                break

        if self.tape_used_size > 0:
            self.debug.output('sending queued files to tar - %s, %s' %
                              (len(self.files.tape_list), self.files.tape_list))
            self.files.gen_final_catalog(self.files.catalog_name, self.files.tape_list,
                                         self.paperdb.file_md5_dict)
            if self.drive_select == 2:
                ## use two tape drives to write data at the same time
                self.debug.output('using two drives')
                self.tar_archive(self.files.catalog_name)
            else:
                ## use one drive to write to two tapes serially
                self.debug.output('using one drive')
                self.tar_archive_single(self.files.catalog_name)
        else:
            ## no files found
            self.debug.output('Abort - no files found')

        self.close_dump()

    def get_list(self, limit=7500, regex=False, pid=False, claim=True):
        """get a file_list less than limit size"""
        ## get a 7.5gb file_list of files to transfer
        self.dump_list, list_size = self.paperdb.get_new(limit, regex=regex, pid=pid)

        ## claim the files so other jobs can request different files
        if self.dump_list and claim:
            self.debug.output(str(list_size))
            self.paperdb.claim_files(self.dump_list)

        return self.dump_list, list_size

    def tar_archive_single(self, catalog_file):
        """send archives to single tape drive using tar"""
        ## track how many copies are written
        tape_copy = 1
        tar_archive_single_status = self.status_code.OK

        ## select ids
        tape_label_ids = self.labeldb.select_ids()
        self.labeldb.claim_ids(tape_label_ids)

        ## load up a fresh set of tapes
        for label_id in tape_label_ids:
            self.debug.output('load tape', label_id, debug_level=128)
            self.tape.load_tape_drive(label_id)

            ## tar files to tape
            self.debug.output('prep tape', debug_level=128)
            self.tape.prep_tape(catalog_file)

            for tape_index in range(self.tape_index):
                self.debug.output('sending tar to single drive', str(tape_index), debug_level=225)
                try:
                    self.tape.write(tape_index)
                except Exception as error:
                    self.debug.output('tape write fail {}'.format(error))
                    self.close_dump()
                    break

            ## we have written two copies
            if tape_copy == 2:
                ## update the dump state
                self.dump_state = self.dump_state_code.dump_write
                dump_verify_status = self.dump_verify(label_id)
                if dump_verify_status is not self.status_code.OK:
                    self.debug.output('Fail: dump_verify {}'.format(dump_verify_status))
                    tar_archive_single_status = self.status_code.tar_archive_single_dump_verify
                    self.close_dump()
                    break

            if tape_copy == 2:
                self.dump_state = self.dump_state_code.dump_verify

            self.debug.output('unloading drive', label_id, debug_level=128)
            self.tape.unload_tape_drive(label_id)

            ## track tape copy
            tape_copy += 1

        ## update the current dump state
        if tar_archive_single_status is self.status_code.OK:
            log_label_ids_status = self.log_label_ids(tape_label_ids, self.files.tape_list)
            if log_label_ids_status is not self.status_code.OK:
                self.debug.output('problem writing labels out: {}'.format(log_label_ids_status))
        else:
            self.debug.output("Abort dump: {}".format(tar_archive_single_status))

        self.close_dump()

    def log_label_ids(self, tape_label_ids, tape_list):
        """send label ids to db"""
        log_label_ids_status = self.status_code.OK

        log_label_ids_status = self.paperdb.write_tape_index(self.files.tape_list,
                                                             ','.join(tape_label_ids))
        if log_label_ids_status is not self.status_code.OK:
            self.debug.output('problem writing label: {}'.format(log_label_ids_status))

        self.files.save_tape_ids(','.join(tape_label_ids))

        log_label_ids_status = self.labeldb.date_ids(tape_label_ids)
        if log_label_ids_status is not self.status_code.OK:
            self.debug.output('problem dating labels: {}'.format(log_label_ids_status))

        return log_label_ids_status

    def dump_verify(self, tape_id):
        """take the tape_id and run a self check, then confirm the tape_list matches"""
        dump_verify_status = self.status_code.OK

        ## run a tape_self_check
        self_check_status, item_index, catalog_list, md5_dict, tape_pid = self.tape_self_check(tape_id)

        ## take output from tape_self_check and compare against current dump
        if self_check_status is self.status_code.OK:
            self.debug.output('confirming item_count {} == {}'.format(self.files.item_index, int(item_index)))
            if self.files.item_index != int(item_index):
                self.debug.output("%s mismatch: %s, %s" % ("item_count", self.files.item_index, item_index))
                dump_verify_status = self.status_code.dump_verify_item_index

            self.debug.output('confirming %s' % "catalog")
            if self.files.tape_list != catalog_list:
                self.debug.output("%s mismatch: %s, %s" % ("catalog", self.files.tape_list, catalog_list))
                dump_verify_status = self.status_code.dump_verify_catalog

            self.debug.output('confirming %s' % "md5_dict")
            if self.paperdb.file_md5_dict != md5_dict:
                self.debug.output("%s mismatch: %s, %s" % ("md5_dict", self.paperdb.file_md5_dict, md5_dict),
                                  debug_level=253)
                dump_verify_status = self.status_code.dump_verify_md5_dict

            self.debug.output('confirming %s' % "pid")
            if self.pid != str(tape_pid):
                self.debug.output("%s mismatch: %s, %s" % ("pid", self.pid, tape_pid))
                dump_verify_status = self.status_code.dump_verify_pid
        else:
            self.debug.output('Fail: tape_self_check_status: %s' % self_check_status)
            return self_check_status

        self.debug.output('final {}'.format(dump_verify_status))
        return dump_verify_status

    def tape_self_check(self, tape_id):
        """take a tape and run an integrity check without reference to an external database

        :rtype : bool
        """
        tape_self_check_status = self.status_code.OK

        ## load the tape if necessary
        ## TODO(dconover): call with the correct tape drive_int or unload tape before tape_self_check
        self.tape.load_tape_drive(tape_id)

        ## read tape_catalog as file_list
        self.debug.output('read catalog from tape: %s' % tape_id)
        first_block = self.tape.read_tape_catalog(tape_id)

        ## parse the archive_list
        ## build a file_md5_dict
        item_index, catalog_list, md5_dict, tape_pid = self.files.final_from_file(catalog=first_block)

        tape_archive_md5_status, reference = self.tape.tape_archive_md5(tape_id, tape_pid,
                                                                        catalog_list, md5_dict)
        if tape_archive_md5_status is not self.status_code.OK:
            self.debug.output("tape failed md5 inspection at index: %s, status: %s" %
                              (reference, tape_archive_md5_status))
            tape_self_check_status = tape_archive_md5_status

        return tape_self_check_status, item_index, catalog_list, md5_dict, tape_pid

    def tar_archive(self, catalog_file):
        """send archives to tape drive pair using tar"""
        ## select ids
        tape_label_ids = self.labeldb.select_ids()
        self.labeldb.claim_ids(tape_label_ids)

        ## load up a fresh set of tapes
        self.tape.load_tape_pair(tape_label_ids)

        ## tar files to tape
        self.tape.prep_tape(catalog_file)
        for tar_index in range(self.tape_index):
            self.debug.output('sending to tape file - %s' % str(tar_index))
            try:
                self.tape.write(tar_index)
            except Exception as error:
                self.debug.output('tape writing exception {}'.format(error))
                break

        self.tape.unload_tape_pair()

        ## write tape locations
        self.debug.output('writing tape_indexes - %s' % self.files.tape_list)
        self.paperdb.write_tape_index(self.files.tape_list, ','.join(tape_label_ids))
        self.debug.output('updating mtx.ids with date')
        self.labeldb.date_ids(tape_label_ids)

    def close_dump(self):
        """orderly close of dump"""

        def _close_init():
            """simple cleanup"""
            pass

        def _close_list():
            """we have claimed files to cleanup"""
            self.paperdb.paperdb_state = self.paperdb.paperdb_state_code.claim

        def _close_queue():
            """files are queued"""
            self.paperdb.paperdb_state = self.paperdb.paperdb_state_code.claim_queue

        def _close_write():
            """files written to tape"""
            self.paperdb.paperdb_state = self.paperdb.paperdb_state_code.claim_write

        def _close_verify():
            """files verified"""
            self.paperdb.paperdb_state = self.paperdb.paperdb_state_code.claim_verify

        close_action = {
            self.dump_state_code.initialize: _close_init,
            self.dump_state_code.dump_list: _close_list,
            self.dump_state_code.dump_queue: _close_queue,
            self.dump_state_code.dump_write: _close_write,
            self.dump_state_code.dump_verify: _close_verify,
        }

        ## prep cleanup state
        close_action[self.dump_state]()

        ## do module cleanup
        self.paperdb.close_paperdb()
        self.files.close_archive()
        self.labeldb.close_mtxdb()
        self.tape.close_changer()

        ## exit
        exit(self.dump_state.value)
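
## ---------------------------------------------------------------------------
## Hypothetical usage sketch, not part of the original module. A full dump is
## driven by archive_to_tape(): batch files from the db, queue them to disk,
## tar them to one or two drives, verify, then log labels back to the db.
## The credentials path is assumed:
def _dump_usage_sketch():
    """illustrative only; needs the tape library, /papertape dirs, and db creds"""
    dump = Dump('/papertape/etc/my.papertape-test.cnf',
                debug=True, drive_select=2, debug_threshold=255)
    dump.archive_to_tape()  ## note: exits the process via close_dump() when finished
## ---------------------------------------------------------------------------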
class DumpFaster(DumpFast):
    """Queueless archiving: data is never transferred to our disk queues.

    Disk queues are still used to maintain state in the event of a partial dump failure.
    Tape verification is rewritten to make use of python threading.
    """

    def __init__(self, credentials='/papertape/etc/my.papertape-test.cnf',
                 mtx_credentials='/home2/obs/.my.mtx.cnf',
                 debug=False, pid=None, disk_queue=True, drive_select=2, debug_threshold=255):
        """initialize"""
        self.version = __version__
        self.pid = "%0.6d%0.3d" % (getpid(), randint(1, 999)) if pid is None else pid
        self.debug = Debug(self.pid, debug=debug, debug_threshold=debug_threshold)
        self.status_code = StatusCode

        self.check_credentials_file(mtx_credentials)
        self.mtx_creds = mtx_credentials

        self.debug.output(credentials)
        self.check_credentials_file(credentials)
        self.paper_creds = credentials

        self.tape_ids = ''

        ## each dump process writes 12GB to /dev/shm (two batches at a time)
        self.batch_size_mb = 12000

        ## (1.5Tb - 1 batch)
        self.tape_size = (1.5 * 1000 * 1000) - self.batch_size_mb
        #self.tape_size = 13000

        ## setup PaperDB connection
        self.paperdb = PaperDB(self.version, self.paper_creds, self.pid, debug=True,
                               debug_threshold=debug_threshold)

        ## setup tape library
        self.labeldb = MtxDB(self.version, self.mtx_creds, self.pid, debug=debug,
                             debug_threshold=debug_threshold)

        ## setup file access
        self.files = Archive(self.version, self.pid, debug=debug, debug_threshold=debug_threshold)

        ## use the pid here to lock changer
        self.drive_select = drive_select
        self.tape = Changer(self.version, self.pid, self.tape_size, debug=True,
                            drive_select=drive_select, disk_queue=disk_queue,
                            debug_threshold=debug_threshold)

        self.dump_list = []
        self.tape_index = 0
        self.tape_used_size = 0

        ## each dump process should write one tape worth of data
        self.dump_state_code = DumpStateCode
        self.dump_state = self.dump_state_code.initialize

    def check_credentials_file(self, credentials):
        """Run checks on a credentials file; currently just check that it
        exists and is not empty.

        :type credentials: string
        """
        ## return true if the credentials file exists and is not zero size
        return path.isfile(credentials) and path.getsize(credentials) > 0

    def dump_pair_verify(self, tape_label_ids):
        """This is a wrapper to perform a threaded version of the original
        call to dump_verify().

        Our "threading" is implemented in three steps:
          1. instantiate VerifyThread (that calls dump_verify()) and start each thread
          2. wait on each thread and get the verification status code from each
          3. check each status code and return failure if either is not "OK"
        """

        ## thread instances need to be started; we use the output to make a
        ## list of started threads
        def _start_verification(thread):
            thread.start()
            return thread

        ## join() blocks until the thread completes, then we can retrieve the
        ## status from the verification
        def _get_verification_status(thread):
            thread.join()
            return thread.dump_verify_status

        ## given a pair of verification status codes, return a "non-OK" status
        ## if either is not "OK"
        def _check_thread_status(status_1, status_2):
            return status_1 if status_1 is not self.status_code.OK else status_2

        ## for each label, start a thread and add it to a list
        started_threads = [_start_verification(VerifyThread(label_id, drive, self))
                           for drive, label_id in enumerate(tape_label_ids)]

        ## for each thread, check the verification status and add it to a list
        return_codes = [_get_verification_status(thread) for thread in started_threads]

        ## for each status code, check if either is not "OK"
        return reduce(_check_thread_status, return_codes)

    def fast_batch(self):
        """skip tar of local archive on disk; send files to two tapes using a single drive."""
        ## batch_files() does the job of making the lists that queue_archive does;
        ## it also updates self.tape_index, which is used by Changer.write()
        self.debug.output('reloading sample data into paperdatatest database')
        if self.batch_files():
            self.debug.output('found %s files' % len(self.files.tape_list))
            self.files.gen_final_catalog(self.files.catalog_name, self.files.tape_list,
                                         self.paperdb.file_md5_dict)
            self.tar_archive_fast(self.files.catalog_name)
            return True
        else:
            self.debug.output("no files batched")
            return self.dump_state_code.dump_list_fail

    def tar_archive_fast(self, catalog_file):
        """Archive files directly to tape, using only a single drive to write 2 tapes"""
        tar_archive_fast_status = self.status_code.OK

        ## select ids
        tape_label_ids = self.labeldb.select_ids()

        ## load up a fresh set of tapes
        self.tape.load_tape_pair(tape_label_ids)

        ## add the catalog to the beginning of the tape
        for label_id in tape_label_ids:
            self.debug.output('archiving to label_id - {}'.format(label_id))

        ## prepare the first block of the tape with the current tape_catalog
        self.tape.prep_tape(catalog_file)

        ## actually write the files in the catalog to a tape pair
        self.debug.output('got list - {}'.format(self.files.tape_list))
        self.tape.archive_from_list(self.files.tape_list)

        ## check the status of the dumps
        tar_archive_fast_status = self.dump_pair_verify(tape_label_ids)

        ## unload the tape pair
        self.tape.unload_tape_pair()

        ## update the db if the current dump status is OK
        if tar_archive_fast_status is self.status_code.OK:
            log_label_ids_status = self.log_label_ids(tape_label_ids)
            if log_label_ids_status is not self.status_code.OK:
                self.debug.output('problem writing labels out: {}'.format(log_label_ids_status))
        else:
            self.debug.output("Abort dump: {}".format(tar_archive_fast_status))

        self.close_dump()
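
## ---------------------------------------------------------------------------
## VerifyThread is used by dump_pair_verify() above but is not defined in this
## section. A minimal sketch of a compatible class, assuming it only wraps
## Dump.dump_verify() and exposes the result as dump_verify_status:
import threading

class _VerifyThreadSketch(threading.Thread):
    """run dump.dump_verify(label_id) in its own thread (illustrative only)"""

    def __init__(self, label_id, drive_int, dump):
        super().__init__()
        self.label_id = label_id
        self.drive_int = drive_int  ## the real class presumably pins work to this drive
        self.dump = dump
        self.dump_verify_status = None

    def run(self):
        ## capture the status so the caller can read it after join()
        self.dump_verify_status = self.dump.dump_verify(self.label_id)
## ---------------------------------------------------------------------------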
class PaperDB(object):
    """Paper database contains information on file locations"""

    def __init__(self, version, credentials, pid, debug=False, debug_threshold=255):
        """Initialize connection and collect file_list of files to dump.

        :type version: int
        :type credentials: string
        :type pid: basestring
        :type debug: bool
        :type debug_threshold: int
        """
        self.pid = pid
        self.version = version
        self.debug = Debug(self.pid, debug=debug, debug_threshold=debug_threshold)
        self.status_code = StatusCode

        self.paperdb_state_code = PaperDBStateCode
        self.paperdb_state = self.paperdb_state_code.initialize
        self.connection_timeout = 90
        self.connection_time = timedelta()

        self.check_credentials_file(credentials)
        self.credentials = credentials
        self.connect = ''
        self.cur = ''
        self.db_connect('init', credentials)

        self.file_list = []
        self.file_md5_dict = {}
        self.claimed_files = []
        self.claimed_state = 0

    def __setattr__(self, attr_name, attr_value):
        """debug.output() when a state variable is updated"""
        class_name = self.__class__.__name__.lower()
        ## we always use the lowercase of the class_name in the state variable
        if attr_name == 'paperdb_state':
            ## debug whenever we update the state variable
            self.debug.output("updating: {} with {}={}".format(class_name, attr_name, attr_value))
        super().__setattr__(attr_name, attr_value)

    def check_credentials_file(self, credentials):
        """Run checks on a credentials file; currently just check that it
        exists and is not empty.

        This class should really implement a more thorough credentials file
        check, since this check is replicated in the dump class already.

        :type credentials: string
        """
        ## return true if the credentials file exists and is not zero size
        return path.isfile(credentials) and path.getsize(credentials) > 0

    def update_connection_time(self):
        """refresh database connection time"""
        self.debug.output('updating connection_time')
        self.connection_time = datetime.now()

    def connection_time_delta(self):
        """return connection age"""
        self.debug.output('connection_time:%s' % self.connection_time)
        delta = datetime.now() - self.connection_time
        return delta.total_seconds()

    def db_connect(self, command=None, credentials=None):
        """connect to the database or reconnect an old session"""
        self.debug.output('input:%s %s' % (command, credentials))
        self.credentials = credentials if credentials is not None else self.credentials
        time_delta = self.connection_timeout + 1 if command == 'init' else self.connection_time_delta()

        self.debug.output("time_delta:%s, timeout:%s" % (time_delta, self.connection_timeout))
        if time_delta > self.connection_timeout:
            self.debug.output("setting connection %s %s" % (credentials, self.connection_timeout))
            self.connect = pymysql.connect(read_default_file=self.credentials,
                                           connect_timeout=self.connection_timeout)
            self.cur = self.connect.cursor()

        self.update_connection_time()
        self.debug.output("connection_time:%s" % self.connection_time)

    def get_new(self, size_limit, regex=False, pid=False):
        """Retrieve a file_list of available files.

        Outputs files that are "write_to_tape".
        Optionally, limit search by file_path regex or pid in tape_index.

        :type size_limit: int
        :type regex: str
        :type pid: bool
        """
        if regex:
            ready_sql = """select source, filesize, md5sum from File
                where source is not null
                and filetype like 'uv%'
                and is_tapeable = 1
                and tape_index is null
                and source like '%s'
                """ % regex
        elif pid:
            ready_sql = """select source, filesize, md5sum from File
                where tape_index = 1{0:s}
                """.format(pid)
        else:
            ready_sql = """select source, filesize, md5sum from File
                where source is not null
                and filetype like 'uv%'
                and is_tapeable = 1
                and tape_index is null
                group by source order by filename;
                """

        self.db_connect()
        self.cur.execute(ready_sql)
        self.update_connection_time()

        self.file_list = []
        total = 0

        for file_info in self.cur.fetchall():
            self.debug.output('found file - %s' % file_info[0], debug_level=254)
            file_size = float(file_info[1])

            ## when size_limit is set to 0, change limit to 1 plus total + file_size
            if size_limit == 0:
                size_limit = total + file_size + 1

            ## if the reported size is larger than the size limit we have a problem
            if file_size > size_limit:
                self.debug.output('file_size (%s) larger than size limit(%s) - %s' %
                                  (file_size, size_limit, file_info[0]), debug_level=254)

            ## check that we don't go over the limit
            if total + file_size < size_limit:
                self.debug.output('file:', file_info[0], debug_level=254)
                self.file_list.append(file_info[0])
                self.file_md5_dict[file_info[0]] = file_info[2]
                total += file_size

        return self.file_list, total

    def enumerate_paths(self):
        """count files under each base path, ignoring size limit and is_tapeable"""
        ## run query with no size limit
        ## remove "is_tapeable=1"
        ready_sql = """select source from File
            where source is not null
            and filetype like 'uv%'
            /* and is_tapeable = 1 */
            and tape_index is null
            group by source order by filename;
            """

        self.db_connect()
        self.cur.execute(ready_sql)
        self.update_connection_time()

        dir_list = {}
        for file_info in self.cur.fetchall():
            ## parse paths like $host:/{mnt/,}$base/$subpath/$file
            path_regex = re.compile(r'(.*:)(/mnt/|/)(\w+)/')
            path_info = path_regex.match(file_info[0]).groups()
            base_path = path_info[0] + path_info[1] + path_info[2]
            dir_list[base_path] = dir_list[base_path] + 1 if base_path in dir_list else 0

        ## return the dictionary of per-path counts
        return dir_list

    def claim_files(self, file_list=None, unclaim=False):
        """Mark files in the database that are "claimed" by a dump process."""
        status_type = self.paperdb_state.value

        ## if no file_list is passed, assume we are updating the existing file_list
        if file_list is None:
            file_list = self.claimed_files

        claim_files_status = self.status_code.OK
        self.db_connect()

        ## build the sql to claim or unclaim the given files
        for file_name in file_list:
            if unclaim is True:
                update_sql = "update File set tape_index=null where source='%s' and tape_index='%s%s'" % \
                             (file_name, status_type, self.pid)
            else:
                ## TODO(dconover): allow claim to use current state
                status_type = self.paperdb_state_code.claim.value
                update_sql = "update File set tape_index='%s%s' where source='%s'" % \
                             (status_type, self.pid, file_name)

            self.debug.output('claim_files - %s' % update_sql)
            try:
                self.cur.execute(update_sql)
            except Exception as mysql_error:
                self.debug.output('mysql_error {}'.format(mysql_error))
                claim_files_status = self.status_code.claim_files_sql_build

        ## commit the claim/unclaim updates
        try:
            self.connect.commit()
            self.claimed_state = status_type
            self.claimed_files.extend(file_list)
        except Exception as mysql_error:
            self.debug.output('mysql_error {}'.format(mysql_error))
            claim_files_status = self.status_code.claim_files_sql_commit

        self.paperdb_state = self.paperdb_state_code.claim
        return claim_files_status

    def unclaim_files(self, file_list=None):
        """Release claimed files from database

        :rtype : bool
        """
        self.claim_files(file_list, unclaim=True)

    def write_tape_index(self, tape_list, tape_id):
        """Take a dictionary of files and labels and update the database,
        recording the barcode of the tape in the tape_index field and setting
        the is_deletable field to 1 for all files just written to tape.

        :param tape_list: dict
        :param tape_id: str
        """
        write_tape_index_status = self.status_code.OK
        self.debug.output("tape_list contains %s files, and with ids: %s" % (len(tape_list), tape_id))
        self.db_connect()

        ## item file_list is set in paper_io.py: self.tape_list.append([queue_pass, int, file])
        for item in tape_list:
            ## tape_index example: 20150103[PAPR2001,PAPR2001]-132:3
            tape_index = "%s[%s]-%s:%s" % (self.version, tape_id, item[0], item[1])
            source = item[2]
            self.debug.output("writing tape_index: %s for %s" % (tape_index, source))
            try:
                self.cur.execute('update File set tape_index="%s", is_deletable=1 where source="%s"' %
                                 (tape_index, source))
            except Exception as mysql_error:
                self.debug.output('error {}'.format(mysql_error))
                write_tape_index_status = self.status_code.write_tape_index_mysql

        try:
            self.connect.commit()
        except Exception as mysql_error:
            self.debug.output('error {}'.format(mysql_error))
            write_tape_index_status = self.status_code.write_tape_index_mysql

        return write_tape_index_status

    def check_tape_locations(self, catalog_list, tape_id):
        """Take a dictionary of files and labels and confirm existence of files on tape.

        :param catalog_list: dict
        :param tape_id: str
        """
        pass

    def close_paperdb(self):
        """depending on state, clean up file claims"""

        def _close():
            """close the database, leave any files in place

            :rtype : bool
            """
            _close_status = True
            try:
                ## close database connections
                self.cur.close()
            except Exception as mysql_error:
                self.debug.output('mysql error {}'.format(mysql_error))
                _close_status = False
            return _close_status

        def _unclaim():
            """unclaim files in database; close database

            :rtype : bool
            """
            #_unclaim_status = True
            self.unclaim_files()
            return _close()

        close_action = {
            self.paperdb_state_code.initialize: _close,
            self.paperdb_state_code.claim: _unclaim,
            self.paperdb_state_code.claim_queue: _close,
            self.paperdb_state_code.claim_write: _close,
            self.paperdb_state_code.claim_verify: _close,
        }

        self.db_connect()
        self.update_connection_time()
        close_action[self.paperdb_state]()

    def __del__(self):
        """close out the connection and set the final state in the database"""
        ## TODO(dconover): depending on self.paperdb_state update paperdata
        ## can self.status_type be replaced with self.paperdb_state?
        ## TODO(dconover): implement self.status_type; update paperdb_state="{}{}".format(self.status_type, self.pid)
        ## TODO(dconover): close database; implement self.db_close()
        pass
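
## ---------------------------------------------------------------------------
## Self-contained sketch, not part of the original module: db_connect() above
## implements lazy reconnection by comparing the session age against
## connection_timeout. The same pattern with the pymysql call stubbed out so
## it runs anywhere (class and names assumed):
from datetime import datetime as _sketch_dt

class _ReconnectSketch(object):
    """reconnect only when the cached session is older than timeout seconds"""

    def __init__(self, timeout=90):
        self.timeout = timeout
        self.connected_at = None

    def connect(self):
        if self.connected_at is None:
            age = self.timeout + 1  ## force the first connection, like command == 'init'
        else:
            age = (_sketch_dt.now() - self.connected_at).total_seconds()
        if age > self.timeout:
            ## the real code calls pymysql.connect(read_default_file=..., connect_timeout=...)
            self.connected_at = _sketch_dt.now()
        return self.connected_at
## ---------------------------------------------------------------------------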
class RamTar(object):
    """handling python tarfile opened directly against tape devices"""

    def __init__(self, pid, drive_select=1, rewrite_path=None, debug=False, debug_threshold=128):
        """initialize"""
        self.pid = pid
        self.debug = Debug(self.pid, debug=debug, debug_threshold=debug_threshold)
        self.drive_select = drive_select
        self.rewrite_path = rewrite_path

        ## if we're not using disk queuing we open the drives differently;
        ## we need to track different states
        ## for faster archiving we keep some data in memory instead of queuing to disk
        self.archive_bytes = BytesIO()
        self.archive_tar = tarfile.open(mode='w:', fileobj=self.archive_bytes)
        self.archive_info = tarfile.TarInfo()

        ## tape opened with tar
        ## this is a dictionary where we will do:
        ## self.tape_drive[drive_int] = tarfile.open(mode='w:')
        self.tape_filehandle = {}
        self.tape_drive = {}

        ## if we use tarfile, we need to track the state
        self.drive_states = RamTarStateCode
        self.drive_state = self.ramtar_tape_drive(drive_select, self.drive_states.drive_init)

    def ramtar_tape_drive(self, drive_int, request):
        """open, close, update state, or reserve a drive for another process

        :rtype : Enum
        """
        self.debug.output('request - {}'.format(request))
        action_return = []  ## TODO(dconover): prly don't need this?

        def init_tar_drive():
            """Mark the given drives as available"""
            new_state = {}
            if int(drive_int) == 2:
                self.debug.output('drive_select==2')
                for _loop_drive_int in 0, 1:
                    new_state[_loop_drive_int] = self.drive_states.drive_init
            else:
                self.debug.output('init single - {}'.format(drive_int))
                reserve_drive = 0 if drive_int == 1 else 1
                new_state[drive_int] = self.drive_states.drive_init
                new_state[reserve_drive] = self.drive_states.drive_reserve
            return new_state

        def open_tar_drive():
            """open a tar file against a particular drive"""
            if int(drive_int) == 2:
                for _loop_int in 0, 1:
                    ## define the actual device path
                    device_path = '/dev/nst{}'.format(_loop_int)
                    if self.drive_state[_loop_int] is self.drive_states.drive_init:
                        self.debug.output('open tar on {}'.format(device_path))
                        ## create a filehandle for the device
                        self.tape_filehandle[_loop_int] = open(device_path, mode='wb')
                        ## send the filehandle to the tarfile
                        self.tape_drive[_loop_int] = tarfile.open(
                            fileobj=self.tape_filehandle[_loop_int], mode='w:')
                        self.drive_state[_loop_int] = self.drive_states.drive_open
                    else:
                        self.debug.output('Fail to open {}:{}'.format(
                            device_path, self.drive_state[_loop_int]))
            else:
                self.debug.output('called with drive_int=={}'.format(drive_int))
                device_path = '/dev/nst{}'.format(drive_int)
                if drive_int in self.drive_state and \
                        self.drive_state[drive_int] is self.drive_states.drive_init:
                    self.debug.output('open tar on {}'.format(device_path))
                    ## create a filehandle for the device
                    self.tape_filehandle[drive_int] = open(device_path, mode='wb')
                    self.tape_drive[drive_int] = tarfile.open(
                        fileobj=self.tape_filehandle[drive_int], mode='w:')
                    self.drive_state[drive_int] = self.drive_states.drive_open
                else:
                    self.debug.output('Fail to open {}'.format(device_path))
            self.debug.output('state={}'.format(self.drive_state))
            return self.drive_state

        def close_tar_drive():
            """close a previously opened tar for a particular drive"""
            if self.drive_state[drive_int] is self.drive_states.drive_open:
                ## close tarfile
                self.tape_drive[drive_int].close()
                ## close tape_filehandle
                self.tape_filehandle[drive_int].close()
                self.drive_state[drive_int] = self.drive_states.drive_init
                self.debug.output('closed drive_int={}'.format(drive_int))
            else:
                self.debug.output('Fail to close drive_int={} ({})'.format(
                    drive_int, self.drive_state[drive_int]))

        action = {
            self.drive_states.drive_init: init_tar_drive,
            self.drive_states.drive_open: open_tar_drive,
            self.drive_states.drive_close: close_tar_drive,
        }

        try:
            action_return = action[request]()
            self.debug.output('action_return = {}'.format(action_return))
        except Exception as action_exception:
            self.debug.output('tar_exception: {}'.format(action_exception))
            raise

        return action_return

    def archive_from_list(self, tape_list):
        """take a tape list, build each archive, write to tapes"""
        archive_dict = defaultdict(list)
        archive_list_dict = defaultdict(list)

        if self.drive_select == 2:
            self.debug.output('writing data to two tapes')

            ## for archive group in list
            ## build a dictionary of archives
            for item in tape_list:
                self.debug.output('item to check: {}'.format(item))
                archive_list_dict[item[0]].append(item)
                archive_dict[item[0]].append(item[-1])

            for tape_index in archive_dict:
                data_dir = '/papertape'
                archive_dir = '/papertape/queue/{}'.format(self.pid)
                archive_prefix = 'paper.{}.{}'.format(self.pid, tape_index)
                archive_name = '{}.tar'.format(archive_prefix)
                archive_file = '{}/{}'.format(archive_dir, archive_name)
                archive_list = '{}/{}.file_list'.format(archive_dir, archive_prefix)

                ## rewind the archive to zero so we don't fill up ram
                self.archive_bytes = BytesIO()
                self.archive_tar = tarfile.open(mode='w:', fileobj=self.archive_bytes)

                ## for each file in the archive group, build the archive
                for item in archive_dict[tape_index]:
                    self.debug.output('item - {}..{}'.format(tape_index, item))
                    #arcname_rewrite = self.rewrite_path
                    data_path = '/'.join([data_dir, item])
                    ## TODO(dconover): remove excess leading paths from archive_path
                    archive_path = '/'.join([archive_prefix, item])
                    self.append_to_archive(data_path, file_path_rewrite=archive_path)

                ## close the file but not the bytestream
                self.archive_tar.close()

                ## touch an empty placeholder file for the archive
                arc = open(archive_file, mode='w')
                arc.close()

                ## send archive group to both tapes
                for drive in [0, 1]:
                    self.debug.output('send data')
                    self.send_archive_to_tape(drive, archive_list, archive_name, archive_file)
        else:
            ## I don't think it's a good idea to do this, since you have to read the data twice
            self.debug.output('skipping data write')
            pass

    def append_to_archive(self, file_path, file_path_rewrite=None):
        """add data to an open archive"""
        arcname = file_path if file_path_rewrite is None else file_path_rewrite
        try:
            self.debug.output('file_path={}, arcname={}'.format(file_path, arcname))
            self.archive_tar.add(file_path, arcname=arcname)
        except Exception as cept:
            self.debug.output('tarfile exception - {}'.format(cept))
            raise

    def send_archive_to_tape(self, drive_int, archive_list, archive_name, archive_file):
        """send the current archive to tape"""
        try:
            self.debug.output('{}'.format(archive_name))
            self.ramtar_tape_drive(drive_int, self.drive_states.drive_open)
            self.debug.output('{}'.format(self.drive_state))

            ## add archive_list
            self.tape_drive[drive_int].add(archive_list)

            ## get the basic info from the blank file we wrote
            self.archive_info = self.tape_drive[drive_int].gettarinfo(archive_file)

            ## fix the size to the byte size of our BytesIO object
            self.archive_info.size = len(self.archive_bytes.getvalue())

            ## rewind
            self.archive_bytes.seek(0)

            ## write the bytes with info to the tape
            self.tape_drive[drive_int].addfile(tarinfo=self.archive_info,
                                               fileobj=self.archive_bytes)
            self.ramtar_tape_drive(drive_int, self.drive_states.drive_close)
            self.archive_bytes.seek(0)
        except Exception as cept:
            self.debug.output('tarfile - {}'.format(cept))
            raise

    def reset_archive(self):
        """reset the archive"""
        self.archive_bytes.seek(0)
        self.archive_bytes.truncate()
        self.archive_tar = tarfile.open(mode='w:', fileobj=self.archive_bytes)
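
## ---------------------------------------------------------------------------
## Self-contained sketch, not part of the original module: the core RamTar
## trick above is to build an inner tar in a BytesIO buffer, then addfile()
## that buffer into the outer tar with TarInfo.size forced to the buffer
## length. Here the outer tar goes to a temp file instead of /dev/nst<N>:
import io
import tarfile
import tempfile

def _ramtar_demo_sketch():
    """write an (empty) in-memory tar as a member of an on-disk tar"""
    inner_bytes = io.BytesIO()
    inner_tar = tarfile.open(mode='w:', fileobj=inner_bytes)
    inner_tar.close()  ## close the tar but keep the bytestream, as RamTar does

    with tempfile.NamedTemporaryFile(suffix='.tar') as outer_file:
        outer_tar = tarfile.open(outer_file.name, mode='w:')
        archive_info = tarfile.TarInfo(name='paper.demo.0.tar')
        archive_info.size = len(inner_bytes.getvalue())  ## fix the member size
        inner_bytes.seek(0)  ## rewind so addfile() reads from the start
        outer_tar.addfile(tarinfo=archive_info, fileobj=inner_bytes)
        outer_tar.close()
## ---------------------------------------------------------------------------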
class Drives(object):
    """class to manage low-level access directly with tape (equivalent of mt-level commands)

    It can also handle python directly opening one or more drives with tar.

    It assumes that exactly two drives are installed, and that you will use
    either one, or both, via the drive_select option.
    """

    def __init__(self, pid, drive_select=2, debug=False, disk_queue=True, debug_threshold=128):
        """initialize debugging and pid"""
        self.pid = pid
        self.debug = Debug(pid, debug=debug, debug_threshold=debug_threshold)
        self.drive_select = drive_select

    ## This method is deprecated because the tape self check runs through every listed archive
    def count_files(self, drive_int):
        """count the number of files on the current tape in the given drive"""
        drive = "/dev/nst%s" % drive_int
        bash_to_count_files = """
            _count_files_on_tape () {  ## count the number of files on tape
                local _count=0;
                while :; do
                    mt -f %s fsf 1 || break
                    let _count+=1
                done
                echo $_count
            }
            _count_files_on_tape
        """ % drive

        output = check_output(bash_to_count_files, shell=True).decode('utf8').split('\n')
        return int(output[0])

    def tar_files(self, files):
        """send files in a file_list to drive(s) with tar"""
        commands = []
        for drive_int in range(self.drive_select):
            commands.append('tar cf /dev/nst%s %s ' % (drive_int, ' '.join(files)))
        self.exec_commands(commands)

    def tar_fast(self, files):
        """send catalog file and file_list of source files to tape as archive"""

    def tar(self, file_name):
        """send the given file_name to drive(s) with tar"""
        commands = []
        for drive_int in range(self.drive_select):
            commands.append('tar cf /dev/nst%s %s ' % (drive_int, file_name))
        self.exec_commands(commands)

    def dd(self, text_file):
        """write text contents to the first 32k block of a tape"""
        commands = []
        for drive_int in range(self.drive_select):
            commands.append('dd conv=sync,block of=/dev/nst%s if=%s bs=32k' % (drive_int, text_file))
        self.exec_commands(commands)

    def dd_read(self, drive_int):
        """assuming a loaded tape, read the first block off the tape and return it as a string"""
        command = ['dd', 'conv=sync,block', 'if=/dev/nst%s' % drive_int, 'bs=32k']
        self.debug.output('%s' % command)
        output = check_output(command).decode('utf8').split('\n')
        return output[:-1]

    def dd_duplicate(self, source_drive_int, destination_drive_int):
        """copy a tape from one drive to the other using dd"""
        source_dev = 'if=/dev/nst{}'.format(source_drive_int)
        destination_dev = 'of=/dev/nst{}'.format(destination_drive_int)
        command = ['dd', 'conv=sync,block', source_dev, destination_dev]
        self.debug.output('{}'.format(command))
        output = check_output(command).decode('utf8').split('\n')

    def md5sum_at_index(self, job_pid, tape_index, directory_path, drive_int=0):
        """given a tape_index and drive_int, return the md5sum of the file at
        that index on the tape in /dev/nst$drive_int."""
        self.debug.output("getting md5 of file at %s in drive %s" % (tape_index, drive_int))

        ## the index is stored like: [PAPR1001, PAPR2001]-0:1
        ## the first number gives the file on tape
        ## the second number gives the file in the tar
        ## but the tar is inside another tar with the full file table,
        ## so to get at an indexed file you must do something like:
        bash_to_md5_selected_file = """
            _block_md5_file_on_tape () {
                local _fsf=1
                local _job_pid=${1:-030390297}
                local _tape_index=${2:-1}
                local _test_path=${3:-data-path}
                local _tape_dev=${4:-0}

                local _tar_number=$_tape_index
                local _archive_tar=papertape/shm/paper.$_job_pid.$_tar_number.tar
                local _test_file=$_test_path/visdata

                ## extract the archive tar, then extract the file to stdout,
                ## then run md5 on stdin
                mt -f /dev/nst$_tape_dev fsf $_fsf &&
                    tar xOf /dev/nst$_tape_dev $_archive_tar |
                        tar xOf - paper.$_job_pid.$_tape_index/$_test_file |
                            md5sum | awk '{print $1}'
            }
            _block_md5_file_on_tape %s %s %s %s
        """ % (job_pid, tape_index, directory_path, drive_int)

        #self.debug.output(bash_to_md5_selected_file, debug_level=252)
        self.debug.output("reading %s" % directory_path)

        try:
            ## check output
            output = check_output(bash_to_md5_selected_file, shell=True).decode('utf8').split('\n')
            ## we should check the output
            self.debug.output('output: %s' % output[0], debug_level=250)
        except CalledProcessError as return_info:
            self.debug.output('return_info: %s' % return_info)
            return False

        return output[0]

    def exec_commands(self, cmds):
        """Exec commands in parallel in multiple processes (as many as we have CPUs)"""
        if not cmds:
            return  ## empty file_list

        def done(proc):
            self.debug.output('process done')
            return proc.poll() is not None

        def success(proc):
            self.debug.output('process success')
            return proc.returncode == 0

        def fail():
            self.debug.output('process fail, now what?')
            return

        processes = []
        while True:
            while cmds:
                task = cmds.pop()
                processes.append(Popen(task, shell=True))

            ## iterate over a copy so we can safely remove finished processes
            for process in list(processes):
                self.debug.output('{}'.format(process.args))
                if done(process):
                    if success(process):
                        processes.remove(process)
                    else:
                        fail()
                        ## if we don't remove the process it will loop infinitely;
                        ## on the other hand, if we proceed without escalating the
                        ## failure properly, we'll probably keep running into the
                        ## same problem with subsequent runs
                        ## TODO(dconover): update return value to terminate dump cleanly
                        ## processes.remove(process)

            if not processes and not cmds:
                self.debug.output('break')
                break
            else:
                self.debug.output('sleep', debug_level=250)
                time.sleep(5)
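
## ---------------------------------------------------------------------------
## Self-contained sketch, not part of the original module: Drives.dd() above
## writes the catalog as the tape's first 32k block (dd conv=sync pads the
## final record), and dd_read() recovers it. The same round trip emulated in
## pure python against a scratch file (path assumed):
def _catalog_block_demo_sketch(catalog_text='demo catalog\n'):
    """pad a catalog to one 32k block, write it, and read it back"""
    block = catalog_text.encode('utf8').ljust(32 * 1024, b'\x00')  ## pad like conv=sync
    with open('/tmp/catalog_block.img', 'wb') as block_file:
        block_file.write(block)
    with open('/tmp/catalog_block.img', 'rb') as block_file:
        return block_file.read(32 * 1024).rstrip(b'\x00').decode('utf8')
## ---------------------------------------------------------------------------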
class Changer(object): """simple tape changer class""" def __init__(self, version, pid, tape_size, disk_queue=True, drive_select=2, debug=False, debug_threshold=255): """init with debugging :type drive_select: int :param drive_select: 0 = nst0, 1 = nst1, 2 = nst{1,2} :type disk_queue: bool :param disk_queue: write archives to a disk queue first? """ self.version = version self.pid = pid self.debug = Debug(self.pid, debug=debug, debug_threshold=debug_threshold) self.tape_size = tape_size self._tape_dev = '/dev/changer' self.status_code = StatusCode self.drive_select = drive_select self.archive_tar = '' self.drive_ids = [] self.tape_ids = [] self.label_in_drive = [] ## return label in given drive self.check_inventory() self.tape_drives = Drives(self.pid, drive_select=drive_select, debug=debug, debug_threshold=debug_threshold) self.disk_queue = disk_queue if not self.disk_queue: ## we need to use Ramtar self.ramtar = FastTar(pid, drive_select=drive_select, rewrite_path=None, debug=debug, debug_threshold=debug_threshold) ## TODO(dconover): implement a lock on the changer to prevent overlapping requests self.changer_state = 0 def check_inventory(self): """check the current inventory of the library with mtx""" output = check_output(['mtx', 'status']).decode("utf-8") self.debug.output(output, debug_level=251) self.drive_ids, self.tape_ids, self.label_in_drive = split_mtx_output( output) for drive_id in self.drive_ids: self.debug.output( '- %s, %s num_tapes: %d' % (id, self.drive_ids[drive_id], len(self.tape_ids))) def print_inventory(self): """print out the current tape library inventory""" for drive_id in self.drive_ids: print('drive: %s, %s' % (id, self.drive_ids[drive_id])) for drive_id in self.tape_ids: print('slot: %s, %s' % (id, self.tape_ids[drive_id])) def get_tape_slot(self, tape_id): """return the slot number where the given tape is currently loaded""" return self.tape_ids[tape_id] def load_tape_pair(self, tape_ids): """load the next available tape pair""" load_tape_pair_status = True if len(tape_ids) == 2: for drive, tape_id in enumerate(tape_ids): if load_tape_pair_status is True: self.debug.output('loading', str(id), str(drive)) load_tape_pair_status = self.load_tape_drive(tape_id, drive=drive) else: self.debug.output( 'load failure for tape_id - {}'.format(tape_id)) else: self.debug.output('failed to load tape pair: %s' % tape_ids) load_tape_pair_status = False return load_tape_pair_status ## using type hinting with Sphinx ## pycharm doesn't seem to like PEP 3107 style type hinting def load_tape_drive(self, tape_id, drive=0): """load a given tape_id into a given drive=drive_int, unload if necessary. 
:type tape_id: label of tape to load :param tape_id: label of tape to load""" status = False self.debug.output('check then load - {}, {}'.format(tape_id, drive)) for attempt in range(3): if self.drives_empty(drive_int=drive): self.debug.output('calling load_tape - ', str(tape_id), str(drive), debug_level=128) self.load_tape(tape_id, drive) status = True break ## return if the drive already contains the tape we want ## just rewind elif str(drive) in self.label_in_drive and self.label_in_drive[str( drive)] == tape_id: ## if we call this function we probably need a rewind self.debug.output('tape loaded; rewinding tape - {}:{}'.format( str(drive), tape_id)) self.rewind_tape(tape_id) status = True break ## if the drive is full attempt to unload, then retry else: self.debug.output( 'different tape loaded, unloading - {}:{}'.format( str(self.label_in_drive), str(drive)), debug_level=128) self.unload_tape_drive(drive) return status def unload_tape_pair(self): """unload the tapes in the current drives""" if not self.drives_empty(): for tape_id in self.drive_ids: self.debug.output('unloading', tape_id) self.unload_tape(tape_id) def unload_tape_drive(self, tape_int): """unload the tapes in the current drives""" self.debug.output('unloading {}'.format(tape_int)) if not self.drives_empty(drive_int=tape_int): self.debug.output('unloading {} from {}'.format( self.label_in_drive[str(tape_int)], tape_int)) self.unload_tape(self.label_in_drive[str(tape_int)]) else: self.debug.output('tape already empty', str(tape_int)) def drives_empty(self, drive_int=None): """return true if the drives are currently empty""" self.debug.output('recheck inventory') self.check_inventory() if drive_int is not None: self.debug.output('called for {} while loaded: {}'.format( drive_int, self.label_in_drive)) if str(drive_int) in self.label_in_drive: self.debug.output('drive loaded already - {} with {}'.format( drive_int, self.label_in_drive[str(drive_int)])) return False else: self.debug.output('drive_empty') return True else: ## TODO(dconover): what's this now? 
self.debug.output('basic check drive labels: %s' % self.label_in_drive) return not len(self.drive_ids) @property def drives_loaded(self): """return true if the drives are loaded""" self.check_inventory() if len(self.drive_ids): return self.get_drive_tape_ids() else: return False def get_drive_tape_ids(self): """get the tape_ids currently loaded in drives""" self.check_inventory() return self.drive_ids def load_tape(self, tape_id, tape_drive): """Load a tape into a free drive slot""" load_tape_status = True try: if self.tape_ids[tape_id]: self.debug.output('Loading - %s' % tape_id) output = check_output([ 'mtx', 'load', str(self.tape_ids[tape_id]), str(tape_drive) ]) self.check_inventory() except KeyError: self.debug.output('tape not in storage - {}'.format(tape_id)) load_tape_status = False return load_tape_status def unload_tape(self, tape_id): """Unload a tape from a drive and put in the original slot""" if self.drive_ids[tape_id]: command = [ 'mtx', 'unload', self.drive_ids[tape_id][1], self.drive_ids[tape_id][0] ] self.debug.output('%s' % command) output = check_output(command) self.check_inventory() else: self.debug.output('tape_id({}) not in drive'.format(tape_id)) def rewind_tape(self, tape_id): """rewind the tape in the given drive""" status = False try: if self.drive_ids[tape_id]: self.debug.output('rewinding tape %s' % tape_id) output = check_output('mt -f /dev/nst%s rewi' % (self.drive_ids[tape_id][0]), shell=True) status = True except CalledProcessError: self.debug.output('rewind error') except KeyError: self.debug.output('tape (%s) not loaded: %s' % (tape_id, self.drive_ids)) return status def write(self, tape_index, tape_list=None): """write data to tape""" ## tar dir to two drives arcname = "paper.%s.%s" % (self.pid, tape_index) tar_name = "/papertape/queue/%s/%s.tar" % (self.pid, arcname) catalog_name = "/papertape/queue/%s/%s.file_list" % (self.pid, arcname) if self.disk_queue: self.debug.output("writing", catalog_name, tar_name) self.tape_drives.tar_files([catalog_name, tar_name]) elif self.disk_queue and tape_list: ## what should we do with a disk queue and a tape_list? self.debug.output('disk queue with tape_list given') raise Exception # self.ramtar.send_archive_to_tape() elif not self.disk_queue and tape_list: ## write a fast archive to disk using the given list of files ##self.ramtar.archive_from_list(tape_list) self.debug.output('unnecessary call to write?') elif not self.disk_queue and not tape_list: self.debug.output('no list given') raise Exception def prep_tape(self, catalog_file): """write the catalog to tape. 
write all of our source code to the first file""" ## write catalog self.debug.output("writing catalog to tape", catalog_file) self.tape_drives.dd(catalog_file) ## write source code #self.tape_drives.tar('/root/git/papertape') def read_tape_catalog(self, tape_id): """read and return first block of tape""" self.rewind_tape(tape_id) drive_int = self.drive_ids[tape_id][0] return self.tape_drives.dd_read(drive_int) def count_files(self, tape_id): """count files of the given tape""" self.rewind_tape(tape_id) drive_int = self.drive_ids[tape_id][0] return self.tape_drives.count_files(drive_int) def tape_archive_md5(self, tape_id, job_pid, catalog_list, md5_dict, drive=0): """loop through each archive on tape and check a random file md5 from each :rtype : bool""" ## default to True tape_archive_md5_status = self.status_code.OK reference = None self.debug.output('loading tape: %s' % tape_id) ## load a tape or rewind the existing tape self.load_tape_drive(tape_id, drive) drive_int = self.drive_ids[tape_id][0] ## for every tar advance the tape ## select a random path from the tape ## run md5sum_at_index(tape_index, drive_int=0) archive_dict = defaultdict(list) ## build a dictionary of archives for item in catalog_list: self.debug.output('item to check: %s' % item) archive_dict[item[0]].append(item[-1]) for tape_index in archive_dict: directory_path = random.choice(archive_dict[tape_index]) ## starting at the beginning of the tape we can advance one at a ## time through each archive and test one directory_path/visdata md5sum self.debug.output('checking md5sum for %s' % directory_path) md5sum = self.tape_drives.md5sum_at_index(job_pid, tape_index, directory_path, drive_int=drive_int) if md5sum != md5_dict[directory_path]: self.debug.output('mdsum does not match: %s, %s' % (md5sum, md5_dict[directory_path])) tape_archive_md5_status = self.status_code.tape_archive_md5_mismatch reference = ":".join([str(tape_index), directory_path]) break else: self.debug.output('md5 match: %s|%s' % (md5sum, md5_dict[directory_path])) self.unload_tape(tape_id) return tape_archive_md5_status, reference def close_changer(self): """cleanup""" ## TODO(dconover): implement changer locking; remove lock pass def append_to_archive(self, file_path, file_path_rewrite=None): """add data to an open archive""" arcname = file_path if file_path_rewrite is None else file_path_rewrite try: self.debug.output('file_path={}, arcname={}'.format( file_path, arcname)) self.archive_tar.add(file_path, arcname=arcname) except Exception as cept: self.debug.output('tarfile exception - {}'.format(cept)) raise def archive_from_list(self, tape_list): """take a tape list, build each archive, write to tapes""" archive_dict = defaultdict(list) archive_list_dict = defaultdict(list) if self.drive_select == 2: self.debug.output('writing data to two tapes') ## for archive group in list ## build a dictionary of archives for item in tape_list: self.debug.output('item to check: {}'.format(item)) archive_list_dict[item[0]].append(item) archive_dict[item[0]].append(item[-1]) for tape_index in archive_dict: data_dir = '/papertape' archive_dir = '/papertape/queue/{}'.format(self.pid) archive_prefix = 'paper.{}.{}'.format(self.pid, tape_index) archive_name = '{}.tar'.format(archive_prefix) #archive_file = '{}/{}'.format(archive_dir,archive_name) archive_file = '{}/shm/{}'.format(data_dir, archive_name) archive_list = '{}/{}.file_list'.format( archive_dir, archive_prefix) self.archive_tar = tarfile.open(archive_file, mode='w:') ## for file in archive group build 
    def archive_from_list(self, tape_list):
        """take a tape list, build each archive, write to tapes"""
        archive_dict = defaultdict(list)
        archive_list_dict = defaultdict(list)

        if self.drive_select == 2:
            self.debug.output('writing data to two tapes')

            ## build a dictionary of archive groups from the list
            for item in tape_list:
                self.debug.output('item to check: {}'.format(item))
                archive_list_dict[item[0]].append(item)
                archive_dict[item[0]].append(item[-1])

            for tape_index in archive_dict:
                data_dir = '/papertape'
                archive_dir = '/papertape/queue/{}'.format(self.pid)
                archive_prefix = 'paper.{}.{}'.format(self.pid, tape_index)
                archive_name = '{}.tar'.format(archive_prefix)
                #archive_file = '{}/{}'.format(archive_dir, archive_name)
                archive_file = '{}/shm/{}'.format(data_dir, archive_name)
                archive_list = '{}/{}.file_list'.format(archive_dir, archive_prefix)

                self.archive_tar = tarfile.open(archive_file, mode='w:')

                ## for each file in the archive group, build the archive
                for item in archive_dict[tape_index]:
                    self.debug.output('item - {}..{}'.format(tape_index, item))
                    #arcname_rewrite = self.rewrite_path
                    data_path = '/'.join([data_dir, item])
                    ## TODO(dconover): remove excess leading paths from archive_path
                    archive_path = '/'.join([archive_prefix, item])
                    self.append_to_archive(data_path, file_path_rewrite=archive_path)

                ## close the file
                self.archive_tar.close()

                ## send the archive group to both tapes
                self.debug.output('send data')
                self.send_archive_to_tape(archive_list, archive_name, archive_file)
        else:
            ## I don't think it's a good idea to do this, since you would
            ## have to read the data twice
            self.debug.output('skipping data write')
            pass

    def send_archive_to_tape(self, archive_list, archive_name, archive_file):
        """send the current archive to tape"""
        try:
            self.debug.output('{}'.format(archive_name))
            ## add archive_list and archive_file
            self.tape_drives.tar_files([archive_list, archive_file])
            ## truncate the current archive to save disk space
            with open(archive_file, 'w') as archive_open:
                archive_open.truncate(0)
        except Exception as cept:
            self.debug.output('tarfile - {}'.format(cept))
            raise
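## Hedged end-to-end sketch (added): how the drive_select=2 fast-archive
## path above might be driven. "Changer" is an assumed name for the
## preceding class; the pid, tape_size, and file names are hypothetical,
## and the tape_list row shape [tape_index, archive_index, file_path]
## follows gen_catalog()/final_from_file() below.
def _example_fast_archive(pid='1003261778'):
    """Illustrative only: build archive groups in shm and send to two tapes."""
    changer = Changer(1, pid, 1500, disk_queue=False, drive_select=2)
    tape_list = [
        [1, 1, 'zen.2455.uv'],  ## archive 1, first item
        [1, 2, 'zen.2456.uv'],  ## archive 1, second item
    ]
    changer.archive_from_list(tape_list)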
class MtxDB(object):
    """db to handle record of label ids

    Field     Type          Null  Key  Default  Extra
    id        mediumint(9)  NO    PRI  NULL     auto_increment
    label     char(8)       YES        NULL
    date      int(11)       YES        NULL
    status    int(11)       YES        NULL
    capacity  int(11)       YES        NULL
    """

    def __init__(self, version, credentials, pid, debug=False, debug_threshold=255):
        """Initialize connection and collect file_list of tape_ids."""
        self.version = version
        self.pid = pid
        self.debug = Debug(self.pid, debug=debug, debug_threshold=debug_threshold)
        self.status_code = StatusCode

        ## database variables
        self.connection_timeout = 90
        self.connection_time = datetime.timedelta()
        self.credentials = credentials
        self.connect = ''
        self.cur = ''
        self.db_connect('init', credentials)

        self.mtxdb_state = 0  ## current dump state

    def __setattr__(self, attr_name, attr_value):
        """debug.output() when a state variable is updated"""
        class_name = self.__class__.__name__.lower()
        ## we always use the lowercase of the class_name in the state variable
        if attr_name == '{}_state'.format(class_name):
            ## debug whenever we update the state variable
            self.debug.output("updating: {} with {}={}".format(class_name, attr_name, attr_value))
        super(self.__class__, self).__setattr__(attr_name, attr_value)

    def update_connection_time(self):
        """refresh the database connection timestamp"""
        self.debug.output('updating connection_time')
        self.connection_time = datetime.datetime.now()

    def connection_time_delta(self):
        """return connection age in seconds"""
        self.debug.output('connection_time:%s' % self.connection_time)
        delta = datetime.datetime.now() - self.connection_time
        return delta.total_seconds()

    def db_connect(self, command=None, credentials=None):
        """connect to the database or reconnect an old session"""
        self.debug.output('input:%s %s' % (command, credentials))
        self.credentials = credentials if credentials is not None else self.credentials
        time_delta = self.connection_timeout + 1 if command == 'init' else self.connection_time_delta()

        self.debug.output("time_delta:%s, timeout:%s" % (time_delta, self.connection_timeout))
        if time_delta > self.connection_timeout:
            self.debug.output("setting connection %s %s" % (credentials, self.connection_timeout))
            self.connect = pymysql.connect(read_default_file=self.credentials, connect_timeout=self.connection_timeout)
            self.cur = self.connect.cursor()

        self.update_connection_time()
        self.debug.output("connection_time:%s" % self.connection_time)

    def get_capacity(self, tape_id):
        """return the recorded capacity for the given tape_id"""
        ## the original stub built the query but never ran it;
        ## execute it and return the single capacity value
        select_sql = "select capacity from ids where id='%s'" % tape_id
        self.db_connect()
        self.cur.execute(select_sql)
        row = self.cur.fetchone()
        return row[0] if row else None

    def select_ids(self):
        """select the lowest matching id pairs"""
        self.db_connect()
        ids = []
        for n in [0, 1]:
            select_sql = """select label from ids
                where date is null
                and label like 'H0C%d%s'
                order by label
            """ % (n + 1, "%")
            self.cur.execute(select_sql)
            #print(self.cur.fetchone()[0])
            ids.append(self.cur.fetchone()[0])
        return ids

    def insert_ids(self, ids):
        """Add new tape_ids to the mtxdb"""
        self.db_connect()
        for label_id in ids:
            insert_sql = "insert into ids (label) values('%s')" % label_id
            print(insert_sql)
            self.cur.execute(insert_sql)
        self.connect.commit()

    def claim_ids(self, ids):
        """Mark tapes in the database that are "claimed" by a dump process."""
        self.db_connect()
        for tape_id in ids:
            claim_query = '''update ids
                set status="%s", description="Paper dump version:%s"
                where label="%s"''' % (self.pid, self.version, tape_id)
            self.debug.output(claim_query)
            self.cur.execute(claim_query)
        self.connect.commit()
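    ## Hedged usage sketch (added): a typical id lifecycle for a dump run
    ## using the methods above; the credentials path is hypothetical:
    ##
    ##   mtxdb = MtxDB(1, '/papertape/etc/mtxdb.cnf', '1003261778')
    ##   ids = mtxdb.select_ids()   ## two unused labels, e.g. 'H0C1...', 'H0C2...'
    ##   mtxdb.claim_ids(ids)       ## mark them as claimed by this pid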
    def date_ids(self, ids):
        """write the date of our completed run to the mtxdb"""
        date_ids_status = self.status_code.OK
        date = datetime.datetime.now().strftime('%Y%m%d-%H%M')
        self.db_connect()
        for tape_id in ids:
            self.debug.output('updating mtxdb: %s, %s' % (date, tape_id))
            date_sql = 'update ids set date="%s" where label="%s"' % (date, tape_id)
            try:
                self.cur.execute(date_sql)
            except Exception as mysql_error:
                self.debug.output('error {}'.format(mysql_error))
                date_ids_status = self.status_code.date_ids_mysql

        try:
            self.connect.commit()
        except Exception as mysql_error:
            self.debug.output('error {}'.format(mysql_error))
            date_ids_status = self.status_code.date_ids_mysql

        return date_ids_status

    def write(self, src_directory):
        """take a path like /dev/shm/1003261778 and create a tar archive on two tapes"""
        self.update_unused_capacity()
        pass

    def update_unused_capacity(self, used=None):
        """Write out unused capacity to the database."""
        self.db_connect()
        pass

    def close_mtxdb(self):
        """cleanup mtxdb state"""
        ## TODO(dconover): dependent on self.mtx_state: claim/unclaim tapes; close mtxdb
        pass
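## Hedged aside (added): the queries above interpolate values with %, which
## works but is easy to get wrong with quoting. pymysql also accepts bound
## parameters; a minimal equivalent of the date_ids() update might look like
## this (cursor/connection objects as created by db_connect()):
def _example_parameterized_date(cur, connect, tape_ids):
    """Illustrative only: the date_ids() update with bound parameters."""
    date = datetime.datetime.now().strftime('%Y%m%d-%H%M')
    for tape_id in tape_ids:
        cur.execute('update ids set date=%s where label=%s', (date, tape_id))
    connect.commit()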
class Archive(object):
    """Build file archives for tape dumps"""

    def __init__(self, version, pid, debug=False, debug_threshold=255, local_transfer=True):
        """Archive file and tar management

        :type version: int
        :type pid: basestring
        :type local_transfer: bool
        :type debug_threshold: int
        :type debug: bool
        :type self: object
        """
        self.pid = pid
        self.debug = Debug(self.pid, debug=debug, debug_threshold=debug_threshold)
        self.version = version
        #self.transfer = LocalTransfer() if local_transfer else Transfer()
        self.transfer = LocalTransfer() if local_transfer else None
        dir_status, self.archive_copy_dir = self.ensure_dir('/papertape/shm/%s/' % self.pid)
        dir_status, self.queue_dir = self.ensure_dir('/papertape/queue/%s/' % self.pid)
        if dir_status is not True:
            self.debug.output('data dir init failed')
            raise Exception

        self.catalog_name = "{0:s}/paper.{1:s}.file_list".format(self.queue_dir, self.pid)
        self.tape_ids_filename = "{0:s}/paper.{1:s}.tape_ids.file_list".format(self.queue_dir, self.pid)
        self.archive_list = []  ## working file_list of files to write
        self.tape_list = []     ## cumulative file_list of written files
        self.item_index = 0     ## file path index (human readable line numbers in catalog)
        self.archive_state = 0  ## current archive state

    def __setattr__(self, attr_name, attr_value):
        """debug.output() when a state variable is updated"""
        class_name = self.__class__.__name__.lower()
        ## we always use the lowercase of the class_name in the state variable
        if attr_name == '{}_state'.format(class_name):
            ## debug whenever we update the state variable
            self.debug.output("updating: {} with {}={}".format(class_name, attr_name, attr_value))
        super(self.__class__, self).__setattr__(attr_name, attr_value)

    def ensure_dir(self, file_path):
        """make sure the directory exists, creating it if necessary

        :param file_path: path to make if it doesn't already exist
        :type file_path: str
        """
        ensure_dir_status = True
        dir_path = os.path.dirname(file_path)
        if not os.path.exists(dir_path):
            try:
                os.makedirs(dir_path)
            except Exception as error:
                self.debug.output('mkdir error {}'.format(error))
                ensure_dir_status = False
        return ensure_dir_status, dir_path

    def build_archive(self, file_list, source_select=None):
        """Copy files to /dev/shm/$PID, create md5sum data for all files"""
        for file_name in file_list:
            transfer_path = '%s/%s' % (self.archive_copy_dir, file_name)
            self.debug.output("build_archive - %s" % file_name)
            get("/papertape/" + file_name, local_path=transfer_path, recursive=True)

    def gen_catalog(self, archive_catalog_file, file_list, tape_index):
        """create an intermediate catalog file for one archive"""
        self.debug.output("intermediate catalog: %s" % archive_catalog_file)
        # noinspection PyArgumentList
        with open(archive_catalog_file, mode='w') as cfile:
            archive_index = 1
            self.archive_list = []
            for file_name in file_list:
                self.debug.output('archive_list: %s %s %s' % (tape_index, archive_index, file_name), debug_level=249)
                self.archive_list.append([tape_index, archive_index, file_name])
                cfile.write("%s:%s:%s\n" % (tape_index, archive_index, file_name))
                archive_index += 1
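    ## For reference (added): gen_catalog() writes one
    ## "tape_index:archive_index:file_name" line per file; with hypothetical
    ## file names an intermediate catalog looks like:
    ##
    ##   1:1:zen.2455.uv
    ##   1:2:zen.2456.uv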
    def gen_final_catalog(self, tape_catalog_file, tape_list, md5_dict):
        """create a catalog file in /papertape/queue/$pid/$pid.file_list

        :param tape_catalog_file: str
        :param tape_list: file_list of [int, int, string]
        """
        self.debug.output('tape_list - %s' % tape_list)

        job_details = " ".join([
            self.pid,
            "(version:", str(self.version),
            "on", datetime.datetime.now().strftime('%Y%m%d-%H%M') + ")",
        ])

        preamble_lines = "\n".join([
            "## Paper dump catalog:" + job_details,
            "## This tape contains files as listed below:",
            "## item_index:tape_index:archive_index:data_md5:dir_path(host:fullpath)\n"
        ])

        self.item_index = 1
        with open(tape_catalog_file, mode='w') as cfile:
            ## write a preamble to describe the contents
            cfile.write(preamble_lines)

            ## write the actual tape_list
            for file_path in tape_list:
                self.debug.output("%s - %s" % (tape_catalog_file, file_path))
                self.debug.output("file_info - %s, %s" % (self.item_index, file_path), debug_level=249)

                ## which archive on tape has the file_path
                tape_index = file_path[0]
                ## where in the archive is the file_path
                archive_index = file_path[1]
                ## what is the file_path
                file_path = file_path[2]
                ## what is the md5sum of the file_path/visdata file
                data_md5 = md5_dict[file_path]

                ## we don't actually need the item_index; it is a convenience
                ## to the user when reading the catalog
                catalog_line = [self.item_index, tape_index, archive_index, data_md5, file_path]
                output = ':'.join(str(x) for x in catalog_line) + "\n"

                ## write the catalog line out to the file
                cfile.write(output)
                self.item_index += 1

        ## the loop leaves item_index one past the last line; adjust it
        self.item_index -= 1

    def final_from_file(self, catalog=None, tape_ids=False):
        """generate the final catalog from a catalog file or string"""
        self.archive_list = []
        md5_dict = {}
        pid = ''
        item_index = 0

        ## the catalog includes a human readable preamble with dump info,
        ## then numbered lines of items like:
        ## "item_index:tape_index:archive_index:visdata_md5sum:directory_path"
        header_line = re.compile('## Paper dump catalog:([0-9]+)')
        catalog_line = re.compile('([0-9]+):([0-9]+):([0-9]+):([a-f0-9]{32}):(.*)')

        if catalog:
            self.debug.output('reading from string')
            catalog_lines = catalog
        else:
            ## read from file_name
            self.debug.output('reading from file_name')
            with open(self.catalog_name, mode='r') as file_name:
                catalog_lines = file_name.readlines()

        for line in catalog_lines:
            if catalog_line.match(line):
                ## split the line into groups
                catalog_info = catalog_line.match(line).groups()

                ## the first number is mostly for human consumption
                item_index = int(catalog_info[0])

                ## the original catalog looks like the last three entries
                tape_index = int(catalog_info[1])
                archive_index = int(catalog_info[2])
                file_path = catalog_info[4]

                md5_dict[file_path] = catalog_info[3]
                catalog_list = [tape_index, archive_index, file_path]
                self.archive_list.append(catalog_list)
            elif header_line.match(line):
                self.debug.output('found header line')
                pid = header_line.match(line).groups()[0]

        return item_index, self.archive_list, md5_dict, pid
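    ## Hedged round-trip example (added): a final catalog line as written by
    ## gen_final_catalog() and re-parsed by final_from_file(); the md5 and
    ## path are hypothetical:
    ##
    ##   1:1:1:0f343b0931126a20f133d67c2b018a3b:zen.2455.uv
    ##
    ## parses to item_index=1, tape_index=1, archive_index=1,
    ## md5_dict['zen.2455.uv'] = '0f343b09...', and the archive_list
    ## entry [1, 1, 'zen.2455.uv'].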
""" arcname = "%s.%s.%s" % ('paper', self.pid, tape_index) tar_name = "%s/%s.tar" % (self.queue_dir, arcname) catalog_name = "%s/%s.file_list" % (self.queue_dir, arcname) ## make the tar in the queue_directory self.tar_archive(self.archive_copy_dir, arcname, tar_name) ## make room for additional transfers self.rm_archive_copy_dir_list(file_list) ## make the catalog self.gen_catalog(catalog_name, file_list, tape_index) def tar_fast_archive(self, tape_id, file_list): """send tar of file chunks directly to tape.""" arcname = "%s.%s.%s" % ('paper', self.pid, tape_id) tar_name = "%s/%s.tar" % (self.queue_dir, arcname) catalog_name = "%s/%s.file_list" % (self.queue_dir, arcname) ## make the tar in the queue_directory self.tar_archive(self.archive_copy_dir, arcname, tar_name) ## make the catalog self.gen_catalog(catalog_name, file_list, tape_id) def rm_archive_copy_dir_list(self, file_list): """remove the given directory tree of files that have been copied into the temporary archive_copy_dir :param file_list: file_list of files :type file_list: list """ for dir_path in file_list: shutil.rmtree('%s/%s' % (self.archive_copy_dir, dir_path)) def tar_archive(self, source, arcname, destination): """create the queued tar for the archive file""" archive_file = tarfile.open(destination, mode='w') archive_file.add(source, arcname=arcname) archive_file.close() def md5(self, directory_prefix, file_path): """return an md5sum for a file""" full_path = '%s/%s' % (directory_prefix, file_path) hasher = hashlib.md5() with open('{}.md5sum'.format(full_path), mode='w') as hash_file: with open(full_path, mode='rb') as open_file: file_buffer = open_file.read() hasher.update(file_buffer) hash_file.write('%s\n' % hasher.hexdigest()) return hasher.hexdigest def save_tape_ids(self, tape_ids): """open a file and write the tape ids in case writing to the db fails""" self.debug.output('saving {0:s} to {1:s}'.format( tape_ids, self.tape_ids_filename)) tape_id_file = open(self.tape_ids_filename, mode='w') tape_id_file.write("[{0:s}]\n".format(tape_ids)) tape_id_file.close() def tape_ids_from_file(self): """Assuming you init from queued run, read in the tape ids from the tape_ids_file""" tape_ids = '' tape_id_line = re.compile("\[(.*)\]") self.debug.output('{0:s}'.format(self.tape_ids_filename), debug_level=128) with open(self.tape_ids_filename, mode='r') as tape_id_file: self.debug.output("opening_file", debug_level=128) for line in tape_id_file: self.debug.output('{0:s}'.format(line), debug_level=240) if tape_id_line.match(line): tape_info = tape_id_line.match(line).groups() tape_ids = tape_info[0] id_list = tape_ids.split(",") return id_list def close_archive(self): """release any locks from the changer""" pass