Example #1
    def __init__(self,
                 pid,
                 drive_select=1,
                 rewrite_path=None,
                 debug=False,
                 debug_threshold=128):
        """initialize"""

        self.pid = pid
        self.debug = Debug(self.pid,
                           debug=debug,
                           debug_threshold=debug_threshold)

        self.drive_select = drive_select
        self.rewrite_path = rewrite_path
        ## if we're not using disk queuing we open the drives differently;
        ## we need to track different states
        ## for faster archiving we keep some data in memory instead of queuing to disk
        self.archive_bytes = BytesIO()
        self.archive_tar = tarfile.open(mode='w:', fileobj=self.archive_bytes)
        self.archive_info = tarfile.TarInfo()

        ## tape opened with tar
        ## this is a dictionary where we will do:
        ## self.tape_drive[drive_int] = tarfile.open(mode='w:')
        self.tape_filehandle = {}
        self.tape_drive = {}

        ## if we use tarfile, we need to track the state
        self.drive_states = RamTarStateCode
        self.drive_state = self.ramtar_tape_drive(drive_select,
                                                  self.drive_states.drive_init)
Example #2
    def __init__(self, version, credentials, pid, debug=False, debug_threshold=255):
        """Initialize connection and collect file_list of files to dump.
        :type version: int
        :type credentials: string
        :type pid: basestring
        :type debug: bool
        :type debug_threshold: int
        """

        self.pid = pid
        self.version = version
        self.debug = Debug(self.pid, debug=debug, debug_threshold=debug_threshold)
        self.status_code = StatusCode

        self.paperdb_state_code = PaperDBStateCode
        self.paperdb_state = self.paperdb_state_code.initialize
        self.connection_timeout = 90
        self.connection_time = timedelta()

        self.check_credentials_file(credentials)
        self.credentials = credentials
        self.connect = ''
        self.cur = ''
        self.db_connect('init', credentials)

        self.file_list = []
        self.file_md5_dict = {}
        self.claimed_files = []
        self.claimed_state = 0
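
A hypothetical usage sketch for the initializer above (the same initializer appears in full as PaperDB in Example #11); the version number, credentials path, and pid are illustrative assumptions:

## hypothetical values; only the call pattern comes from the snippets in this section
paperdb = PaperDB(1, '~/.my.papertape.cnf', '000123001', debug=True, debug_threshold=128)
file_list, total = paperdb.get_new(7500)  ## files totalling under 7500 MB
paperdb.claim_files(file_list)            ## mark the files as claimed by this pid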
Example #3
    def __init__(self,
                 pid,
                 drive_select=2,
                 debug=False,
                 disk_queue=True,
                 debug_threshold=128):
        """initialize debugging and pid"""
        self.pid = pid
        self.debug = Debug(pid, debug=debug, debug_threshold=debug_threshold)
        self.drive_select = drive_select
Example #4
    def __init__(self,
                 version,
                 pid,
                 tape_size,
                 disk_queue=True,
                 drive_select=2,
                 debug=False,
                 debug_threshold=255):
        """init with debugging
        :type drive_select: int
        :param drive_select: 0 = nst0, 1 = nst1, 2 = nst{1,2}
        :type disk_queue: bool
        :param disk_queue: write archives to a disk queue first?
        """

        self.version = version
        self.pid = pid
        self.debug = Debug(self.pid,
                           debug=debug,
                           debug_threshold=debug_threshold)
        self.tape_size = tape_size
        self._tape_dev = '/dev/changer'
        self.status_code = StatusCode
        self.drive_select = drive_select
        self.archive_tar = ''

        self.drive_ids = []
        self.tape_ids = []
        self.label_in_drive = []  ## label of the tape currently in each drive

        self.check_inventory()
        self.tape_drives = Drives(self.pid,
                                  drive_select=drive_select,
                                  debug=debug,
                                  debug_threshold=debug_threshold)

        self.disk_queue = disk_queue
        if not self.disk_queue:
            ## we need to use Ramtar
            self.ramtar = FastTar(pid,
                                  drive_select=drive_select,
                                  rewrite_path=None,
                                  debug=debug,
                                  debug_threshold=debug_threshold)
        ## TODO(dconover): implement a lock on the changer to prevent overlapping requests
        self.changer_state = 0
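
A minimal sketch of the drive_select convention documented in the docstring above (0 = nst0, 1 = nst1, 2 = nst{1,2}); select_devices() is a hypothetical helper, not part of the original class:

def select_devices(drive_select):
    """return the /dev/nst* device paths implied by drive_select"""
    if drive_select == 2:
        return ['/dev/nst0', '/dev/nst1']
    return ['/dev/nst{}'.format(drive_select)]

assert select_devices(0) == ['/dev/nst0']
assert select_devices(2) == ['/dev/nst0', '/dev/nst1']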
Example #5
    def __init__(self,
                 version,
                 pid,
                 debug=False,
                 debug_threshold=255,
                 local_transfer=True):
        """Archive file and tar management

        :type version: int
        :type pid: basestring
        :type local_transfer: bool
        :type debug_threshold: int
        :type debug: bool
        :type self: object
        """

        self.pid = pid
        self.debug = Debug(self.pid,
                           debug=debug,
                           debug_threshold=debug_threshold)

        self.version = version
        #self.transfer = LocalTransfer() if local_transfer else Transfer()
        self.transfer = LocalTransfer() if local_transfer else None

        dir_status, self.archive_copy_dir = self.ensure_dir(
            '/papertape/shm/%s/' % self.pid)
        dir_status, self.queue_dir = self.ensure_dir('/papertape/queue/%s/' %
                                                     self.pid)

        if dir_status is not True:
            self.debug.output('data dir init failed')
            raise Exception('data dir init failed')

        self.catalog_name = "{0:s}/paper.{1:s}.file_list".format(
            self.queue_dir, self.pid)
        self.tape_ids_filename = "{0:s}/paper.{1:s}.tape_ids.file_list".format(
            self.queue_dir, self.pid)
        self.archive_list = []  ## working file_list of files to write
        self.tape_list = []  ## cumulative file_list of written files
        self.item_index = 0  ## index of file paths (human-readable line numbers in the catalog)
        self.archive_state = 0  ## current archive state
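
A quick illustration of the queue-directory and catalog naming above, with a hypothetical pid (the exact queue_dir value returned by ensure_dir() is an assumption):

queue_dir = '/papertape/queue/000123001'  ## from ensure_dir('/papertape/queue/%s/' % pid)
catalog_name = "{0:s}/paper.{1:s}.file_list".format(queue_dir, '000123001')
print(catalog_name)  ## -> /papertape/queue/000123001/paper.000123001.file_list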
Example #6
    def __init__(self,
                 version,
                 credentials,
                 pid,
                 debug=False,
                 debug_threshold=255):
        """Initialize connection and collect file_list of tape_ids."""

        self.version = version
        self.pid = pid
        self.debug = Debug(self.pid,
                           debug=debug,
                           debug_threshold=debug_threshold)
        self.status_code = StatusCode

        ## database variables
        self.connection_timeout = 90
        self.connection_time = datetime.timedelta()
        self.credentials = credentials
        self.connect = ''
        self.cur = ''
        self.db_connect('init', credentials)

        self.mtxdb_state = 0  ## current dump state
Example #7
class FastTar(RamTar):
    """handling python tarfile opened directly against tape devices"""
    def __init__(self,
                 pid,
                 drive_select=1,
                 rewrite_path=None,
                 debug=False,
                 debug_threshold=128):
        """initialize"""

        self.pid = pid
        self.debug = Debug(self.pid,
                           debug=debug,
                           debug_threshold=debug_threshold)

        self.drive_select = drive_select
        self.rewrite_path = rewrite_path
        ## if we're not using disk queuing we open the drives differently;
        ## we need to track different states
        ## for faster archiving we keep some data in memory instead of queuing to disk
        self.archive_tar = ''

        ## tape opened with tar
        ## this is a dictionary where we will do:
        ## self.tape_drive[drive_int] = tarfile.open(mode='w:')
        self.tape_filehandle = {}
        self.tape_drive = {}

        ## if we use tarfile, we need to track the state
        self.drive_states = RamTarStateCode
        self.drive_state = self.ramtar_tape_drive(drive_select,
                                                  self.drive_states.drive_init)

    def send_archive_to_tape(self, drive_int, archive_list, archive_name,
                             archive_file):
        """send the current archive to tape"""
        try:
            self.debug.output('{}'.format(archive_name))
            self.ramtar_tape_drive(drive_int, self.drive_states.drive_open)
            self.debug.output('{}'.format(self.drive_state))

            ## add archive_list
            self.tape_drive[drive_int].add(archive_list)

            ## write the tape
            #self.tape_drive[drive_int].add(archive_file)
            self.ramtar_tape_drive(drive_int, self.drive_states.drive_close)

            ## truncate the current archive to save disk space
            ## (mode 'w' already truncates; truncate(0) makes it explicit)
            archive_open = open(archive_file, 'w')
            archive_open.truncate(0)
            archive_open.close()

        except Exception as cept:
            self.debug.output('tarfile - {}'.format(cept))
            raise
Example #8
    def __init__(self,
                 credentials,
                 debug=False,
                 pid=None,
                 disk_queue=True,
                 drive_select=2,
                 debug_threshold=255):
        """initialize"""

        self.version = __version__
        self.pid = "%0.6d%0.3d" % (getpid(),
                                   randint(1, 999)) if pid is None else pid
        self.debug = Debug(self.pid,
                           debug=debug,
                           debug_threshold=debug_threshold)

        self.status_code = StatusCode
        self.mtx_creds = '~/.my.mtx.cnf'
        self.debug.output(credentials)
        self.paper_creds = credentials

        self.tape_ids = ''

        ## each dump process writes 6 GB to /dev/shm (two at a time, 12 GB per batch)
        self.batch_size_mb = 12000

        ## 1.5 TB minus one batch of headroom
        self.tape_size = (1.5 * 1000 * 1000) - self.batch_size_mb
        #self.tape_size = 13000

        ## setup PaperDB connection
        #self.paperdb = PaperDB(self.version, self.paper_creds, self.pid, debug=True, debug_threshold=debug_threshold)
        ## test database
        self.paperdb = TestPaperDB(self.version,
                                   self.paper_creds,
                                   self.pid,
                                   debug=True,
                                   debug_threshold=debug_threshold)
        ## reload test data
        #self.paperdb.load_sample_data()

        ## setup tape library
        self.labeldb = MtxDB(self.version,
                             self.mtx_creds,
                             self.pid,
                             debug=debug,
                             debug_threshold=debug_threshold)

        ## setup file access
        self.files = Archive(self.version,
                             self.pid,
                             debug=debug,
                             debug_threshold=debug_threshold)

        ## use the pid here to lock changer
        self.drive_select = drive_select
        self.tape = Changer(self.version,
                            self.pid,
                            self.tape_size,
                            debug=True,
                            drive_select=drive_select,
                            disk_queue=disk_queue,
                            debug_threshold=debug_threshold)

        self.dump_list = []
        self.tape_index = 0
        self.tape_used_size = 0  ## each dump process should write one tape worth of data
        self.dump_state_code = DumpStateCode
        self.dump_state = self.dump_state_code.initialize
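
A worked check of the capacity budgeting above; the constants are copied from the initializer, and only the arithmetic is added:

batch_size_mb = 12000                            ## one batch: 12 GB staged in /dev/shm
tape_size = (1.5 * 1000 * 1000) - batch_size_mb  ## 1.5 TB in MB, minus one batch of headroom
print(int(tape_size // batch_size_mb))           ## -> 124 batches fit on one tape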
Example #9
class Dump(object):
    """Coordinate a dump to tape based on deletable files in database"""
    def __init__(self,
                 credentials,
                 debug=False,
                 pid=None,
                 disk_queue=True,
                 drive_select=2,
                 debug_threshold=255):
        """initialize"""

        self.version = __version__
        self.pid = "%0.6d%0.3d" % (getpid(),
                                   randint(1, 999)) if pid is None else pid
        self.debug = Debug(self.pid,
                           debug=debug,
                           debug_threshold=debug_threshold)

        self.status_code = StatusCode
        self.mtx_creds = '~/.my.mtx.cnf'
        self.debug.output(credentials)
        self.paper_creds = credentials

        self.tape_ids = ''

        ## each dump process writes 6 GB to /dev/shm (two at a time, 12 GB per batch)
        self.batch_size_mb = 12000

        ## 1.5 TB minus one batch of headroom
        self.tape_size = (1.5 * 1000 * 1000) - self.batch_size_mb
        #self.tape_size = 13000

        ## setup PaperDB connection
        #self.paperdb = PaperDB(self.version, self.paper_creds, self.pid, debug=True, debug_threshold=debug_threshold)
        ## test database
        self.paperdb = TestPaperDB(self.version,
                                   self.paper_creds,
                                   self.pid,
                                   debug=True,
                                   debug_threshold=debug_threshold)
        ## reload test data
        #self.paperdb.load_sample_data()

        ## setup tape library
        self.labeldb = MtxDB(self.version,
                             self.mtx_creds,
                             self.pid,
                             debug=debug,
                             debug_threshold=debug_threshold)

        ## setup file access
        self.files = Archive(self.version,
                             self.pid,
                             debug=debug,
                             debug_threshold=debug_threshold)

        ## use the pid here to lock changer
        self.drive_select = drive_select
        self.tape = Changer(self.version,
                            self.pid,
                            self.tape_size,
                            debug=True,
                            drive_select=drive_select,
                            disk_queue=disk_queue,
                            debug_threshold=debug_threshold)

        self.dump_list = []
        self.tape_index = 0
        self.tape_used_size = 0  ## each dump process should write one tape worth of data
        self.dump_state_code = DumpStateCode
        self.dump_state = self.dump_state_code.initialize

    def archive_to_tape(self):
        """master method to loop through files to write data to tape"""

        ## get a file_list of files, transfer to disk, write to tape
        while self.tape_used_size + self.batch_size_mb < self.tape_size:

            ## get a file_list of files to dump
            archive_list, archive_size = self.get_list(self.batch_size_mb)

            if archive_list:
                try:
                    ## copy files to b5, gen catalog file
                    self.files.build_archive(archive_list)

                    ## files to tar on disk with catalog
                    self.files.queue_archive(self.tape_index, archive_list)

                    ## mark where we are
                    self.dump_state = self.dump_state_code.dump_queue

                except Exception as error:
                    self.debug.output(
                        'archive build/queue error {}'.format(error))
                    self.close_dump()

                ## Files in these lists should be identical, but archive_list has extra data
                ## archive_list: [[0, 1, 'test:/testdata/testdir'], [0, 2, 'test:/testdata/testdir2'], ... ]
                ## file_list: ['test:/testdata/testdir', 'test:/testdata/testdir2', ... ]
                self.debug.output('archive_list - %s' %
                                  self.files.archive_list)
                self.debug.output('file_list - %s' % archive_list)

                ## queue_archive() builds self.files.archive_list, which we use to update the tape_list
                self.files.tape_list.extend(self.files.archive_list)
                self.debug.output(
                    "q:%s l:%s t:%s" %
                    (self.tape_used_size, archive_size, self.tape_size))

                ## add archive_size to current tape_used_size
                self.tape_used_size += archive_size
                self.tape_index += 1

            else:
                ## we ran out of files
                self.debug.output('file_list empty')
                break

        if self.tape_used_size > 0:
            self.debug.output(
                'sending queued files to tar - %s, %s' %
                (len(self.files.tape_list), self.files.tape_list))
            self.files.gen_final_catalog(self.files.catalog_name,
                                         self.files.tape_list,
                                         self.paperdb.file_md5_dict)
            if self.drive_select == 2:
                ## use two tape drives to write data at the same time
                self.debug.output('using two drives')
                self.tar_archive(self.files.catalog_name)
            else:
                ## use one drive to write to two tapes serially
                self.debug.output('using one drive')
                self.tar_archive_single(self.files.catalog_name)

        else:
            ## no files found
            self.debug.output('Abort - no files found')

        self.close_dump()

    def get_list(self, limit=7500, regex=False, pid=False, claim=True):
        """get a file_list less than limit size"""

        ## get a 7.5 gb file_list of files to transfer
        self.dump_list, list_size = self.paperdb.get_new(limit,
                                                         regex=regex,
                                                         pid=pid)

        ## claim the files so other jobs can request different files
        if self.dump_list and claim:
            self.debug.output(str(list_size))
            self.paperdb.claim_files(self.dump_list)
        return self.dump_list, list_size

    def tar_archive_single(self, catalog_file):
        """send archives to single tape drive using tar"""

        ## track how many copies are written
        tape_copy = 1
        tar_archive_single_status = self.status_code.OK

        ## select ids
        tape_label_ids = self.labeldb.select_ids()
        self.labeldb.claim_ids(tape_label_ids)

        ## load up a fresh set of tapes
        for label_id in tape_label_ids:
            self.debug.output('load tape', label_id, debug_level=128)
            self.tape.load_tape_drive(label_id)

            ## tar files to tape
            self.debug.output('prep tape', debug_level=128)
            self.tape.prep_tape(catalog_file)

            for tape_index in range(self.tape_index):
                self.debug.output('sending tar to single drive',
                                  str(tape_index),
                                  debug_level=225)
                try:
                    self.tape.write(tape_index)
                except Exception as error:
                    self.debug.output('tape write fail {}'.format(error))
                    self.close_dump()
                    break

            ## we have written two copies
            if tape_copy == 2:
                ## update the dump state
                self.dump_state = self.dump_state_code.dump_write

            dump_verify_status = self.dump_verify(label_id)
            if dump_verify_status is not self.status_code.OK:
                self.debug.output(
                    'Fail: dump_verify {}'.format(dump_verify_status))
                tar_archive_single_status = self.status_code.tar_archive_single_dump_verify
                self.close_dump()
                break

            if tape_copy == 2:
                self.dump_state = self.dump_state_code.dump_verify

            self.debug.output('unloading drive', label_id, debug_level=128)
            self.tape.unload_tape_drive(label_id)

            ## track tape copy
            tape_copy += 1

        ## update the current dump state
        if tar_archive_single_status is self.status_code.OK:
            log_label_ids_status = self.log_label_ids(tape_label_ids,
                                                      self.files.tape_list)
            if log_label_ids_status is not self.status_code.OK:
                self.debug.output('problem writing labels out: {}'.format(
                    log_label_ids_status))
        else:
            self.debug.output(
                "Abort dump: {}".format(tar_archive_single_status))

        self.close_dump()

    def log_label_ids(self, tape_label_ids, tape_list):
        """send label ids to db"""
        log_label_ids_status = self.status_code.OK
        log_label_ids_status = self.paperdb.write_tape_index(
            self.files.tape_list, ','.join(tape_label_ids))
        if log_label_ids_status is not self.status_code.OK:
            self.debug.output(
                'problem writing label: {}'.format(log_label_ids_status))
            self.files.save_tape_ids(','.join(tape_label_ids))

        log_label_ids_status = self.labeldb.date_ids(tape_label_ids)
        if log_label_ids_status is not self.status_code.OK:
            self.debug.output(
                'problem dating labels: {}'.format(log_label_ids_status))

        return log_label_ids_status

    def dump_verify(self, tape_id):
        """take the tape_id and run a self check,
        then confirm the tape_list matches

        """

        dump_verify_status = self.status_code.OK

        ## run a tape_self_check
        self_check_status, item_index, catalog_list, md5_dict, tape_pid = self.tape_self_check(
            tape_id)

        ## take output from tape_self_check and compare against current dump
        if self_check_status is self.status_code.OK:

            self.debug.output('confirming item_count {} == {}'.format(
                self.files.item_index, int(item_index)))
            if self.files.item_index != int(item_index):
                self.debug.output(
                    "%s mismatch: %s, %s" %
                    ("item_count", self.files.item_index, item_index))
                dump_verify_status = self.status_code.dump_verify_item_index

            self.debug.output('confirming %s' % "catalog")
            if self.files.tape_list != catalog_list:
                self.debug.output(
                    "%s mismatch: %s, %s" %
                    ("catalog", self.files.tape_list, catalog_list))
                dump_verify_status = self.status_code.dump_verify_catalog

            self.debug.output('confirming %s' % "md5_dict")
            if self.paperdb.file_md5_dict != md5_dict:
                self.debug.output(
                    "%s mismatch: %s, %s" %
                    ("md5_dict", self.paperdb.file_md5_dict, md5_dict),
                    debug_level=253)
                dump_verify_status = self.status_code.dump_verify_md5_dict

            self.debug.output('confirming %s' % "pid")
            if self.pid != str(tape_pid):
                self.debug.output("%s mismatch: %s, %s" %
                                  ("pid", self.pid, tape_pid))
                dump_verify_status = self.status_code.dump_verify_pid

        else:
            self.debug.output('Fail: tape_self_check_status: %s' %
                              self_check_status)
            return self_check_status

        self.debug.output('final {}'.format(dump_verify_status))
        return dump_verify_status

    def tape_self_check(self, tape_id):
        """take a tape and run an integrity check without reference to an external database

        :rtype : tuple
        """
        tape_self_check_status = self.status_code.OK

        ## load the tape if necessary
        ## TODO(dconover): call with the correct tape drive_int or unload tape before tape_self_check
        self.tape.load_tape_drive(tape_id)

        ## read tape_catalog as file_list
        self.debug.output('read catalog from tape: %s' % tape_id)
        first_block = self.tape.read_tape_catalog(tape_id)

        ## parse the archive_list
        ## build an file_md5_dict
        item_index, catalog_list, md5_dict, tape_pid = self.files.final_from_file(
            catalog=first_block)

        tape_archive_md5_status, reference = self.tape.tape_archive_md5(
            tape_id, tape_pid, catalog_list, md5_dict)
        if tape_archive_md5_status is not self.status_code.OK:
            self.debug.output(
                "tape failed md5 inspection at index: %s, status: %s" %
                (reference, tape_archive_md5_status))
            tape_self_check_status = tape_archive_md5_status

        return tape_self_check_status, item_index, catalog_list, md5_dict, tape_pid

    def tar_archive(self, catalog_file):
        """send archives to tape drive pair using tar"""

        ## select ids
        tape_label_ids = self.labeldb.select_ids()
        self.labeldb.claim_ids(tape_label_ids)

        ## load up a fresh set of tapes
        self.tape.load_tape_pair(tape_label_ids)

        ## tar files to tape
        self.tape.prep_tape(catalog_file)
        for tar_index in range(self.tape_index):
            self.debug.output('sending to tape file - %s' % str(tar_index))
            try:
                self.tape.write(tar_index)
            except Exception as error:
                self.debug.output('tape writing exception {}'.format(error))
                break

        self.tape.unload_tape_pair()

        ## write tape locations
        self.debug.output('writing tape_indexes - %s' % self.files.tape_list)
        self.paperdb.write_tape_index(self.files.tape_list,
                                      ','.join(tape_label_ids))
        self.debug.output('updating mtx.ids with date')
        self.labeldb.date_ids(tape_label_ids)

    def close_dump(self):
        """orderly close of dump"""
        def _close_init():
            """simple cleanup"""
            pass

        def _close_list():
            """we have claimed files to cleanup"""
            self.paperdb.paperdb_state = self.paperdb.paperdb_state_code.claim

        def _close_queue():
            """files are queued"""
            self.paperdb.paperdb_state = self.paperdb.paperdb_state_code.claim_queue

        def _close_write():
            """files written to tape"""
            self.paperdb.paperdb_state = self.paperdb.paperdb_state_code.claim_write

        def _close_verify():
            """files verified"""
            self.paperdb.paperdb_state = self.paperdb.paperdb_state_code.claim_verify

        close_action = {
            self.dump_state_code.initialize: _close_init,
            self.dump_state_code.dump_list: _close_list,
            self.dump_state_code.dump_queue: _close_queue,
            self.dump_state_code.dump_write: _close_write,
            self.dump_state_code.dump_verify: _close_verify,
        }

        ## prep cleanup state
        close_action[self.dump_state]()

        ## do module cleanup
        self.paperdb.close_paperdb()
        self.files.close_archive()
        self.labeldb.close_mtxdb()
        self.tape.close_changer()

        ## exit
        exit(self.dump_state.value)
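
close_dump() above dispatches cleanup through a dict keyed on the current dump state. A minimal standalone sketch of that pattern, with illustrative states rather than the original DumpStateCode members:

from enum import Enum

class State(Enum):
    initialize = 0
    dump_write = 1

def _close_init():
    print('nothing claimed; nothing to clean up')

def _close_write():
    print('files written to tape; release claims accordingly')

close_action = {State.initialize: _close_init, State.dump_write: _close_write}
close_action[State.dump_write]()  ## -> files written to tape; release claims accordingly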
Example #10
class DumpFaster(DumpFast):
    """Queless archiving means that the data is never transferred to our disk queues

    Disk queues are still used to maintain state in the event of a partial dump failure
    Tape verification is rewritten to make use of python threading.

    """
    def __init__(self,
                 credentials='/papertape/etc/my.papertape-test.cnf',
                 mtx_credentials='/home2/obs/.my.mtx.cnf',
                 debug=False,
                 pid=None,
                 disk_queue=True,
                 drive_select=2,
                 debug_threshold=255):
        """initialize"""

        self.version = __version__
        self.pid = "%0.6d%0.3d" % (getpid(),
                                   randint(1, 999)) if pid is None else pid
        self.debug = Debug(self.pid,
                           debug=debug,
                           debug_threshold=debug_threshold)

        self.status_code = StatusCode
        self.check_credentials_file(mtx_credentials)
        self.mtx_creds = mtx_credentials

        self.debug.output(credentials)
        self.check_credentials_file(credentials)
        self.paper_creds = credentials

        self.tape_ids = ''

        ## each dump process writes 12 GB to /dev/shm (two at a time)
        self.batch_size_mb = 12000

        ## 1.5 TB minus one batch of headroom
        self.tape_size = (1.5 * 1000 * 1000) - self.batch_size_mb
        #self.tape_size = 13000

        ## setup PaperDB connection
        self.paperdb = PaperDB(self.version,
                               self.paper_creds,
                               self.pid,
                               debug=True,
                               debug_threshold=debug_threshold)

        ## setup tape library
        self.labeldb = MtxDB(self.version,
                             self.mtx_creds,
                             self.pid,
                             debug=debug,
                             debug_threshold=debug_threshold)

        ## setup file access
        self.files = Archive(self.version,
                             self.pid,
                             debug=debug,
                             debug_threshold=debug_threshold)

        ## use the pid here to lock changer
        self.drive_select = drive_select
        self.tape = Changer(self.version,
                            self.pid,
                            self.tape_size,
                            debug=True,
                            drive_select=drive_select,
                            disk_queue=disk_queue,
                            debug_threshold=debug_threshold)

        self.dump_list = []
        self.tape_index = 0
        self.tape_used_size = 0  ## each dump process should write one tape worth of data
        self.dump_state_code = DumpStateCode
        self.dump_state = self.dump_state_code.initialize

    def check_credentials_file(self, credentials):
        """Run checks on a credentials file; currently just check that it exists and is not empty.
        :type credentials: string
        """
        ## return True if the credentials file exists and is not zero size
        return path.isfile(credentials) and path.getsize(credentials) > 0

    def dump_pair_verify(self, tape_label_ids):
        """This is a wrapper to perform a threaded version of the
        original call to dump_verify(). Our "threading" is implemented in three
        steps:

          1. instantiate VerifyThread (that calls dump_verify()) and start each thread
          2. wait on each thread and get the verification status code from each
          3. check each status code and return failure if either is not "OK"
        """

        ## thread instances need to be started; the return value lets us build a list of started threads
        def _start_verification(thread):
            thread.start()
            return thread

        ## join() blocks until the thread completes; then we can retrieve the status from the verification
        def _get_verification_status(thread):
            thread.join()
            return thread.dump_verify_status

        ## given a pair of verification status codes, return a "non-OK" status if either is not "OK"
        def _check_thread_status(status_1, status_2):
            return status_1 if status_1 is not self.status_code.OK else status_2

        ## foreach label, start a thread and add it to a list
        started_threads = [
            _start_verification(VerifyThread(label_id, drive, self))
            for drive, label_id in enumerate(tape_label_ids)
        ]

        ## foreach thread, check the verification status and add it to a list
        return_codes = [
            _get_verification_status(thread) for thread in started_threads
        ]

        ## foreach status code, check if either is not "OK"
        return reduce(_check_thread_status, return_codes)

    def fast_batch(self):
        """skip tar of local archive on disk
           send files to two tapes using a single drive."""

        ## batch_files() does the job of making the lists that queue_archive does
        ## it also updates self.tape_index which is used by Changer.write()
        self.debug.output('reloading sample data into paperdatatest database')

        if self.batch_files():
            self.debug.output('found %s files' % len(self.files.tape_list))
            self.files.gen_final_catalog(self.files.catalog_name,
                                         self.files.tape_list,
                                         self.paperdb.file_md5_dict)
            self.tar_archive_fast(self.files.catalog_name)
            return True
        else:
            self.debug.output("no files batched")
            return self.dump_state_code.dump_list_fail

    def tar_archive_fast(self, catalog_file):
        """Archive files directly to tape using only a single drive to write 2 tapes"""

        tar_archive_fast_status = self.status_code.OK

        ## select ids
        tape_label_ids = self.labeldb.select_ids()

        ## load up a fresh set of tapes
        self.tape.load_tape_pair(tape_label_ids)

        ## add the catalog to the beginning of the tape
        for label_id in tape_label_ids:
            self.debug.output('archiving to label_id - {}'.format(label_id))

        ## prepare the first block of the tape with the current tape_catalog
        self.tape.prep_tape(catalog_file)

        ## actually write the files in the catalog to a tape pair
        self.debug.output('got list - {}'.format(self.files.tape_list))
        self.tape.archive_from_list(self.files.tape_list)

        ## check the status of the dumps
        tar_archive_fast_status = self.dump_pair_verify(tape_label_ids)

        ## unload the tape pair
        self.tape.unload_tape_pair()

        ## update the db if the current dump status is OK
        if tar_archive_fast_status is self.status_code.OK:
            log_label_ids_status = self.log_label_ids(tape_label_ids)
            if log_label_ids_status is not self.status_code.OK:
                self.debug.output('problem writing labels out: {}'.format(
                    log_label_ids_status))
        else:
            self.debug.output("Abort dump: {}".format(tar_archive_fast_status))
            self.close_dump()
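
VerifyThread is not included in this snippet; the following is a sketch inferred from how dump_pair_verify() uses it above (the constructor signature and the dump_verify_status attribute are assumptions, not confirmed source):

import threading

class VerifyThread(threading.Thread):
    """run Dump.dump_verify() for one tape in its own thread"""
    def __init__(self, label_id, drive_int, dump):
        super().__init__()
        self.label_id = label_id
        self.drive_int = drive_int
        self.dump = dump
        self.dump_verify_status = None  ## read by the caller after join()

    def run(self):
        self.dump_verify_status = self.dump.dump_verify(self.label_id)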
Example #11
class PaperDB(object):
    """Paper database contains information on file locations"""

    def __init__(self, version, credentials, pid, debug=False, debug_threshold=255):
        """Initialize connection and collect file_list of files to dump.
        :type version: int
        :type credentials: string
        :type pid: basestring
        :type debug: bool
        :type debug_threshold: int
        """

        self.pid = pid
        self.version = version
        self.debug = Debug(self.pid, debug=debug, debug_threshold=debug_threshold)
        self.status_code = StatusCode

        self.paperdb_state_code = PaperDBStateCode
        self.paperdb_state = self.paperdb_state_code.initialize
        self.connection_timeout = 90
        self.connection_time = timedelta()

        self.check_credentials_file(credentials)
        self.credentials = credentials
        self.connect = ''
        self.cur = ''
        self.db_connect('init', credentials)

        self.file_list = []
        self.file_md5_dict = {}
        self.claimed_files = []
        self.claimed_state = 0

    def __setattr__(self, attr_name, attr_value):
        """debug.output() when a state variable is updated"""
        class_name = self.__class__.__name__.lower()

        ## we always use the lowercase of the class_name in the state variable
        if attr_name == 'paperdb_state':
            ## debug whenever we update the state variable
            self.debug.output("updating: {} with {}={}".format(class_name, attr_name, attr_value))

        super().__setattr__(attr_name, attr_value)

    def check_credentials_file(self, credentials):
        """Run checks on a credentials file; currently just check that it exists and is not empty.
        This class should really implement a more thorough credentials file check, since this
        check is already replicated in the dump class.

        Parameters:
        :type credentials: string
        """
        ## return True if the credentials file exists and is not zero size
        return path.isfile(credentials) and path.getsize(credentials) > 0

    def update_connection_time(self):
        """refresh database connection time"""
        self.debug.output('updating connection_time')
        self.connection_time = datetime.now()

    def connection_time_delta(self):
        """return connection age"""
        self.debug.output('connection_time:%s' % self.connection_time)
        delta = datetime.now() - self.connection_time
        return delta.total_seconds()

    def db_connect(self, command=None, credentials=None):
        """connect to the database or reconnect an old session"""
        self.debug.output('input:%s %s' % (command, credentials))
        self.credentials = credentials if credentials is not None else self.credentials
        time_delta = self.connection_timeout + 1 if command == 'init' else self.connection_time_delta()

        self.debug.output("time_delta:%s, timeout:%s" % (time_delta, self.connection_timeout))
        if time_delta > self.connection_timeout:
            self.debug.output("setting connection %s %s" % (credentials, self.connection_timeout))
            self.connect = pymysql.connect(read_default_file=self.credentials, connect_timeout=self.connection_timeout)
            self.cur = self.connect.cursor()

        self.update_connection_time()
        self.debug.output("connection_time:%s" % self.connection_time)

    def get_new(self, size_limit, regex=False, pid=False):
        """Retrieve a file_list of available files.

        Outputs files that are "write_to_tape"
        Optionally, limit search by file_path regex or pid in tape_index

        Arguments:
        :param size_limit: int
        :param regex: str
        :param pid: bool
        """

        if regex:
            ready_sql = """select source, filesize, md5sum from File
                where source is not null
                and filetype like 'uv%'
                and is_tapeable = 1 
                and tape_index is null
                and source like '%s'
            """ % regex
        elif pid:
            ## claimed files store the claim-state value (1) followed by the pid in tape_index
            ready_sql = """select source, filesize, md5sum from File
                where tape_index = '1{0:s}'
            """.format(pid)
        else:
            ready_sql = """select source, filesize, md5sum from File
                where source is not null 
                and filetype like 'uv%'
                and is_tapeable = 1 
                and tape_index is null
                group by source order by filename;
            """

        self.db_connect()
        self.cur.execute(ready_sql)
        self.update_connection_time()

        self.file_list = []
        total = 0

        for file_info in self.cur.fetchall():
            self.debug.output('found file - %s' % file_info[0], debug_level=254)
            file_size = float(file_info[1])

            ## when size_limit is set to 0, change limit to 1 plus total + file_size
            if size_limit == 0:
                size_limit = total + file_size + 1

            ## if the reported size is larger than the size limit we have a problem
            if file_size > size_limit:
                self.debug.output('file_size (%s) larger than size limit(%s) - %s' % (file_size, size_limit, file_info[0]), debug_level=254)

            ## check that we don't go over the limit
            if total+file_size < size_limit:
                self.debug.output('file:', file_info[0], debug_level=254)
                self.file_list.append(file_info[0])
                self.file_md5_dict[file_info[0]] = file_info[2]
                total += file_size

        return self.file_list, total

    def enumerate_paths(self):
        ## run query with no size limit
        ## remove "is_tapeable=1"
        ready_sql = """select source from File
                        where source is not null
                        and filetype like 'uv%'
                        /* and is_tapeable = 1 */
                        and tape_index is null
                        group by source order by filename;
                    """

        self.db_connect()
        self.cur.execute(ready_sql)
        self.update_connection_time()

        dir_list = {}
        for file_info in self.cur.fetchall():
            ## parse paths
            ## like $host:/{mnt/,}$base/$subpath/$file
            path_regex = re.compile(r'(.*:)(/mnt/|/)(\w+)/')
            path_info = path_regex.match(file_info[0]).groups()
            base_path = path_info[0] + path_info[1] + path_info[2]
            dir_list[base_path] = dir_list[base_path] + 1 if base_path in dir_list else 1

        ## return dict mapping base_path to file count
        return dir_list

    def claim_files(self, file_list=None, unclaim=False):
        """Mark files in the database that are "claimed" by a dump process."""

        status_type = self.paperdb_state.value
        ## if no file_list is passed assume we are updating existing file_list
        if file_list is None:
            file_list = self.claimed_files

        claim_files_status = self.status_code.OK
        self.db_connect()

        ## build SQL to claim (or unclaim) each of the given files
        for file_name in file_list:

            if unclaim is True:
                update_sql = "update File set tape_index=null where source='%s' and tape_index='%s%s'" % (file_name, status_type, self.pid)
            else:
                ## TODO(dconover): allow claim to use current state
                status_type = self.paperdb_state_code.claim.value
                update_sql = "update File set tape_index='%s%s' where source='%s'" % (status_type, self.pid, file_name)

            self.debug.output('claim_files - %s' % update_sql)
            try:
                self.cur.execute(update_sql)
            except Exception as mysql_error:
                self.debug.output('mysql_error {}'.format(mysql_error))
                claim_files_status = self.status_code.claim_files_sql_build

        ## commit the claim/unclaim updates
        try:
            self.connect.commit()
            self.claimed_state = status_type
            self.claimed_files.extend(file_list)
        except Exception as mysql_error:
            self.debug.output('mysql_error {}'.format(mysql_error))
            claim_files_status = self.status_code.claim_files_sql_commit

        self.paperdb_state = self.paperdb_state_code.claim
        return claim_files_status

    def unclaim_files(self, file_list=None):
        """Release claimed files from database
        :rtype : bool
        """

        self.claim_files(file_list, unclaim=True)

    def write_tape_index(self, tape_list, tape_id):
        """Take a dictionary of files and labels and update the database

        Record the barcode of the tape in the tape_index field and set the
        is_deletable field to 1 for all files just written to tape.
        :param tape_list: dict
        :param tape_id: str
        """

        write_tape_index_status = self.status_code.OK
        self.debug.output("tape_list contains %s files, and with ids: %s" % (len(tape_list), tape_id))
        self.db_connect()

        ## item file_list is set in paper_io.py: self.tape_list.append([queue_pass, int, file])
        for item in tape_list:
            ## tape_index: 20150103[PAPR2001,PAPR2001]-132:3
            tape_index = "%s[%s]-%s:%s" % (self.version, tape_id, item[0], item[1])
            source = item[2]
            self.debug.output("writing tape_index: %s for %s" % (tape_index, source))
            try:
                self.cur.execute('update File set tape_index="%s", is_deletable=1 where source="%s"' % (tape_index, source))
            except Exception as mysql_error:
                self.debug.output('error {}'.format(mysql_error))
                write_tape_index_status = self.status_code.write_tape_index_mysql

        try:
            self.connect.commit()
        except Exception as mysql_error:
            self.debug.output('error {}'.format(mysql_error))
            write_tape_index_status = self.status_code.write_tape_index_mysql

        return write_tape_index_status

    def check_tape_locations(self, catalog_list, tape_id):
        """Take a dictionary of files and labels and confirm existence of files on tape.

        :param catalog_list: dict
        :param tape_id: str
        """

        pass


    def close_paperdb(self):
        """depending on state clean-up file claims"""

        def _close():
            """close the database leave any files in place
            :rtype : bool
            """

            _close_status = True
            try:
                ## close database connections
                self.cur.close()
            except Exception as mysql_error:
                self.debug.output('mysql error {}'.format(mysql_error))
                _close_status = False

            return _close_status

        def _unclaim():
            """unlcaim files in database; close database
            :rtype : bool
            """
            #_unclaim_status = True
            self.unclaim_files()
            return _close()

        close_action = {
            self.paperdb_state_code.initialize : _close,
            self.paperdb_state_code.claim : _unclaim,
            self.paperdb_state_code.claim_queue : _close,
            self.paperdb_state_code.claim_write : _close,
            self.paperdb_state_code.claim_verify : _close,
            }

        self.db_connect()
        self.update_connection_time()
        close_action[self.paperdb_state]()

    def __del__(self):
        """close out the connection and set the final state in the database"""
        ## TODO(dconover): depending on self.paperdb_state update paperdata
        ## can self.status_type be replaced with self.paperdb_state?
        ## TODO(dconover): implement self.status_type; update paperdb_state="{}{}".format(self.status_type, self.pid)
        ## TODO(dconover): close database; implement self.db_close()
        pass
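
A runnable check of the path regex used by enumerate_paths() above; the sample source string is illustrative but follows the $host:/{mnt/,}$base/$subpath/$file layout described in the comments:

import re

path_regex = re.compile(r'(.*:)(/mnt/|/)(\w+)/')
path_info = path_regex.match('test:/testdata/testdir').groups()
base_path = path_info[0] + path_info[1] + path_info[2]
print(base_path)  ## -> test:/testdata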
Example #12
class RamTar(object):
    """handling python tarfile opened directly against tape devices"""
    def __init__(self,
                 pid,
                 drive_select=1,
                 rewrite_path=None,
                 debug=False,
                 debug_threshold=128):
        """initialize"""

        self.pid = pid
        self.debug = Debug(self.pid,
                           debug=debug,
                           debug_threshold=debug_threshold)

        self.drive_select = drive_select
        self.rewrite_path = rewrite_path
        ## if we're not using disk queuing we open the drives differently;
        ## we need to track different states
        ## for faster archiving we keep some data in memory instead of queuing to disk
        self.archive_bytes = BytesIO()
        self.archive_tar = tarfile.open(mode='w:', fileobj=self.archive_bytes)
        self.archive_info = tarfile.TarInfo()

        ## tape opened with tar
        ## this is a dictionary where we will do:
        ## self.tape_drive[drive_int] = tarfile.open(mode='w:')
        self.tape_filehandle = {}
        self.tape_drive = {}

        ## if we use tarfile, we need to track the state
        self.drive_states = RamTarStateCode
        self.drive_state = self.ramtar_tape_drive(drive_select,
                                                  self.drive_states.drive_init)

    def ramtar_tape_drive(self, drive_int, request):
        """open, close, update state, or reserve a drive for another process

        :rtype : Enum
        """

        self.debug.output('request - {}'.format(request))
        action_return = []

        ## TODO(dconover): probably don't need this?
        def init_tar_drive():
            """Mark the given drives as available
            """

            new_state = {}
            if int(drive_int) == 2:
                self.debug.output('drive_select==2')
                for _loop_drive_int in 0, 1:
                    new_state[_loop_drive_int] = self.drive_states.drive_init
            else:
                self.debug.output('init single - {}'.format(drive_int))
                reserve_drive = 0 if drive_int == 1 else 1
                new_state[drive_int] = self.drive_states.drive_init
                new_state[reserve_drive] = self.drive_states.drive_reserve

            return new_state

        def open_tar_drive():
            """open a tar file against a particular drive"""
            if int(drive_int) == 2:
                for _loop_int in 0, 1:
                    ## define the actual device path for each drive in the pair
                    device_path = '/dev/nst{}'.format(_loop_int)
                    if self.drive_state[
                            _loop_int] is self.drive_states.drive_init:
                        self.debug.output('open tar on {}'.format(device_path))
                        ## create a filehandle for the device
                        self.tape_filehandle[_loop_int] = open(device_path,
                                                               mode='wb')
                        ## send the filehandle to the tarfile
                        self.tape_drive[_loop_int] = tarfile.open(
                            fileobj=self.tape_filehandle[_loop_int], mode='w:')
                        self.drive_state[
                            _loop_int] = self.drive_states.drive_open
                    else:
                        self.debug.output('Fail to open {}:{}'.format(
                            device_path, self.drive_state[_loop_int]))
            else:
                self.debug.output(
                    'called with drive_int=={}'.format(drive_int))
                device_path = '/dev/nst{}'.format(drive_int)
                if drive_int in self.drive_state and self.drive_state[
                        drive_int] is self.drive_states.drive_init:
                    self.debug.output('open tar on {}'.format(device_path))
                    ## create a filehandle for the device
                    self.tape_filehandle[drive_int] = open(device_path,
                                                           mode='wb')
                    self.tape_drive[drive_int] = tarfile.open(
                        fileobj=self.tape_filehandle[drive_int], mode='w:')
                    self.drive_state[drive_int] = self.drive_states.drive_open
                else:
                    self.debug.output('Fail to open {}'.format(device_path))
            self.debug.output('state={}'.format(self.drive_state))
            return self.drive_state

        def close_tar_drive():
            """close a previously opened tar for a particular drive"""
            if self.drive_state[drive_int] is self.drive_states.drive_open:

                ## close tarfile
                self.tape_drive[drive_int].close()

                ## close tape_filehandle
                self.tape_filehandle[drive_int].close()

                self.drive_state[drive_int] = self.drive_states.drive_init
                self.debug.output('closed drive_int={}'.format(drive_int))
            else:
                self.debug.output('Fail to close drive_int={} ({})'.format(
                    drive_int, self.drive_state[drive_int]))

        action = {
            self.drive_states.drive_init: init_tar_drive,
            self.drive_states.drive_open: open_tar_drive,
            self.drive_states.drive_close: close_tar_drive
        }

        try:
            action_return = action[request]()
            self.debug.output('action_return = {}'.format(action_return))
        except Exception as action_exception:
            self.debug.output('tar_exception: {}'.format(action_exception))
            raise

        return action_return

    def archive_from_list(self, tape_list):
        """take a tape list, build each archive, write to tapes"""

        archive_dict = defaultdict(list)
        archive_list_dict = defaultdict(list)

        if self.drive_select == 2:
            self.debug.output('writing data to two tapes')
            ## for archive group in list
            ## build a dictionary of archives
            for item in tape_list:
                self.debug.output('item to check: {}'.format(item))
                archive_list_dict[item[0]].append(item)
                archive_dict[item[0]].append(item[-1])

            for tape_index in archive_dict:

                data_dir = '/papertape'
                archive_dir = '/papertape/queue/{}'.format(self.pid)
                archive_prefix = 'paper.{}.{}'.format(self.pid, tape_index)
                archive_name = '{}.tar'.format(archive_prefix)
                archive_file = '{}/{}'.format(archive_dir, archive_name)
                archive_list = '{}/{}.file_list'.format(
                    archive_dir, archive_prefix)

                ## start a fresh in-memory archive so we don't fill up ram
                self.archive_bytes = BytesIO()
                self.archive_tar = tarfile.open(mode='w:',
                                                fileobj=self.archive_bytes)

                ## for file in archive group build archive
                for item in archive_dict[tape_index]:
                    self.debug.output('item - {}..{}'.format(tape_index, item))
                    #arcname_rewrite = self.rewrite_path
                    data_path = '/'.join([data_dir, item])
                    ## TODO(dconover): remove excess leading paths from archive_path
                    archive_path = '/'.join([archive_prefix, item])
                    self.append_to_archive(data_path,
                                           file_path_rewrite=archive_path)

                ## close the tarfile but not the underlying bytestream
                self.archive_tar.close()
                ## create an empty stub file; send_archive_to_tape() reads its
                ## metadata with gettarinfo() for the in-memory archive
                arc = open(archive_file, mode='w')
                arc.close()

                ## send archive group to both tapes
                for drive in [0, 1]:
                    self.debug.output('send data')
                    self.send_archive_to_tape(drive, archive_list,
                                              archive_name, archive_file)

        else:
            ## I don't think it's a good idea to do this since you have to read the data twice
            self.debug.output('skipping data write')
            pass

    def append_to_archive(self, file_path, file_path_rewrite=None):
        """add data to an open archive"""
        arcname = file_path if file_path_rewrite is None else file_path_rewrite
        try:
            self.debug.output('file_path={}, arcname={}'.format(
                file_path, arcname))
            self.archive_tar.add(file_path, arcname=arcname)
        except Exception as cept:
            self.debug.output('tarfile exception - {}'.format(cept))
            raise

    def send_archive_to_tape(self, drive_int, archive_list, archive_name,
                             archive_file):
        """send the current archive to tape"""
        try:
            self.debug.output('{}'.format(archive_name))
            self.ramtar_tape_drive(drive_int, self.drive_states.drive_open)
            self.debug.output('{}'.format(self.drive_state))
            ## add archive_list
            self.tape_drive[drive_int].add(archive_list)

            ## get the basic info from the blank file we wrote
            self.archive_info = self.tape_drive[drive_int].gettarinfo(
                archive_file)

            ## fix the size to the byte size of our BytesIO object
            self.archive_info.size = len(self.archive_bytes.getvalue())

            ## rewind
            self.archive_bytes.seek(0)

            ## write the bytes with info to the tape
            self.tape_drive[drive_int].addfile(tarinfo=self.archive_info,
                                               fileobj=self.archive_bytes)
            self.ramtar_tape_drive(drive_int, self.drive_states.drive_close)
            self.archive_bytes.seek(0)

        except Exception as cept:
            self.debug.output('tarfile - {}'.format(cept))
            raise

    def reset_archive(self):
        """reset the archive"""
        self.archive_bytes.seek(0)
        self.archive_bytes.truncate()
        self.archive_tar = tarfile.open(mode='w:', fileobj=self.archive_bytes)
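
A standalone sketch of the in-memory nesting trick used by send_archive_to_tape() above: build a tar in a BytesIO, then addfile() it into an outer tar with the size fixed up by hand. The file names are illustrative, and the outer tar is written to a local file instead of a /dev/nst* device:

import tarfile
from io import BytesIO

inner_bytes = BytesIO()
inner_tar = tarfile.open(mode='w:', fileobj=inner_bytes)
inner_tar.close()  ## an empty inner archive is enough to show the mechanics

with open('stub.tar', 'w'):  ## blank stub supplies name/mode metadata via gettarinfo()
    pass

with tarfile.open('outer.tar', mode='w:') as outer_tar:
    info = outer_tar.gettarinfo('stub.tar')
    info.size = len(inner_bytes.getvalue())  ## fix the size to the in-memory byte count
    inner_bytes.seek(0)                      ## rewind before addfile() reads the stream
    outer_tar.addfile(tarinfo=info, fileobj=inner_bytes)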
Example #13
class Drives(object):
    """class to manage low level access directly with tape (equivalient of mt level commands)

    It also can handle python directly opening or more drives with tar.
    It assumes that exactly two drives are installed, and that you will use either one, or both
    via the tape_select option
    """
    def __init__(self,
                 pid,
                 drive_select=2,
                 debug=False,
                 disk_queue=True,
                 debug_threshold=128):
        """initialize debugging and pid"""
        self.pid = pid
        self.debug = Debug(pid, debug=debug, debug_threshold=debug_threshold)
        self.drive_select = drive_select

    ## This method is deprecated because the tape self-check runs through every listed archive
    def count_files(self, drive_int):
        """count the number of files on the current tape in the given drive"""
        drive = "/dev/nst%s" % drive_int
        bash_to_count_files = """
            _count_files_on_tape () {  ## count the number of files on tape
                local _count=0
                while :; do
                    mt -f %s fsf 1 || break
                    let _count+=1
                done

                echo $_count
            }

            _count_files_on_tape
        """ % drive
        output = check_output(bash_to_count_files,
                              shell=True).decode('utf8').split('\n')

        return int(output[0])

    def tar_files(self, files):
        """send files in a file_list to drive(s) with tar"""
        commands = []
        for drive_int in range(self.drive_select):
            commands.append('tar cf /dev/nst%s  %s ' %
                            (drive_int, ' '.join(files)))
        self.exec_commands(commands)

    def tar_fast(self, files):
        """send catalog file and file_list of source files to tape as archive"""
        pass

    def tar(self, file_name):
        """send the given file_name to a drive(s) with tar"""
        commands = []
        for drive_int in range(self.drive_select):
            commands.append('tar cf /dev/nst%s %s ' % (drive_int, file_name))
        self.exec_commands(commands)

    def dd(self, text_file):
        """write text contents to the first 32k block of a tape"""
        commands = []
        for drive_int in range(self.drive_select):
            commands.append('dd conv=sync,block of=/dev/nst%s if=%s bs=32k' %
                            (drive_int, text_file))
        self.exec_commands(commands)

    def dd_read(self, drive_int):
        """assuming a loaded tape, read off the first block from the tape and
        return it as a string"""

        command = [
            'dd', 'conv=sync,block',
            'if=/dev/nst%s' % drive_int, 'bs=32k'
        ]
        self.debug.output('%s' % command)
        output = check_output(command).decode('utf8').split('\n')

        return output[:-1]

    def dd_duplicate(self, source_drive_int, destination_drive_int):
        """copy a tape from one drive to the other using dd"""
        source_dev = 'if=/dev/nst{}'.format(source_drive_int)
        destination_dev = 'of=/dev/nst{}'.format(destination_drive_int)

        command = ['dd', 'conv=sync,block', source_dev, destination_dev]
        self.debug.output('{}'.format(command))
        output = check_output(command).decode('utf8').split('\n')

    def md5sum_at_index(self,
                        job_pid,
                        tape_index,
                        directory_path,
                        drive_int=0):
        """given a tape_index and drive_int, return the md5sum of the file
        at that index on the tape in /dev/nst$drive_index."""

        self.debug.output("getting md5 of file at %s in drive %s" %
                          (tape_index, drive_int))

        commands = []
        ## the index is stored like: [PAPR1001, PAPR2001]-0:1
        ## the first number gives the file on tape
        ## the second number gives the file on tar
        ## but the tar is inside another tar with the full file table
        ## to get at an indexed file you must do something like the bash
        ## below (a pure-Python equivalent is sketched after this class):
        bash_to_md5_selected_file = """
            _block_md5_file_on_tape () {

                local _fsf=1
                local _job_pid=${1:-030390297}
                local _tape_index=${2:-1}
                local _test_path=${3:-data-path}
                local _tape_dev=${4:-0}

                local _tar_number=$_tape_index
                local _archive_tar=papertape/shm/paper.$_job_pid.$_tar_number.tar
                local _test_file=$_test_path/visdata

                ## extract the archive tar, then extract the file to stdout, then run md5 on stdin
                mt -f /dev/nst$_tape_dev fsf $_fsf &&
                    tar xOf /dev/nst$_tape_dev $_archive_tar|
                        tar xOf - paper.$_job_pid.$_tape_index/$_test_file|
                            md5sum|awk '{print $1}'
            }

            _block_md5_file_on_tape %s %s %s %s
        """ % (job_pid, tape_index, directory_path, drive_int)

        #self.debug.output(bash_to_md5_selected_file, debug_level=252)
        self.debug.output("reading %s" % directory_path)

        try:
            ## check output
            output = check_output(bash_to_md5_selected_file,
                                  shell=True).decode('utf8').split('\n')
            ## we should check the output
            self.debug.output('output: %s' % output[0], debug_level=250)

        except CalledProcessError as return_info:
            self.debug.output('return_info: %s' % return_info)
            return False

        return output[0]

    def exec_commands(self, cmds):
        """ Exec commands in parallel in multiple process
        (as much as we have CPU)
        """
        if not cmds: return  # empty file_list

        def done(proc):
            self.debug.output('process done')
            return proc.poll() is not None

        def success(proc):
            self.debug.output('process success')
            return proc.returncode == 0

        def fail():
            self.debug.output('process fail, now what?')
            return

        processes = []
        while True:
            while cmds:
                task = cmds.pop()
                processes.append(Popen(task, shell=True))

            for process in processes:
                self.debug.output('{}'.format(process.args))
                if done(process):
                    if success(process):
                        processes.remove(process)
                    else:
                        fail()
                        ## if we don't remove the process it will loop infinitely
                        ## on the other hand if we proceed without escalating the failure properly,
                        ## we'll probably keep running into the same problem with subsequent runs
                        ## TODO(dconover): update return value to terminate dump cleanly
                        ## processes.remove(process)

            if not processes and not cmds:
                self.debug.output('break')
                break
            else:
                self.debug.output('sleep', debug_level=250)
                time.sleep(5)
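
## The bash pipeline in md5sum_at_index() above extracts a tar that lives
## inside another tar and md5s one member. A pure-Python sketch of the same
## idea using tarfile, assuming the outer archive is readable from a path
## rather than positioned on a tape device (names here are hypothetical):
import hashlib
import tarfile

def md5_nested_member(outer_tar_path, inner_tar_name, member_name):
    """md5 a file stored inside a tar that is itself a member of another tar"""
    hasher = hashlib.md5()
    with tarfile.open(outer_tar_path, mode='r:') as outer_tar:
        inner_fileobj = outer_tar.extractfile(inner_tar_name)
        with tarfile.open(fileobj=inner_fileobj, mode='r:') as inner_tar:
            member = inner_tar.extractfile(member_name)
            for chunk in iter(lambda: member.read(1024 * 1024), b''):
                hasher.update(chunk)
    return hasher.hexdigest()
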
Example #14
class Changer(object):
    """simple tape changer class"""
    def __init__(self,
                 version,
                 pid,
                 tape_size,
                 disk_queue=True,
                 drive_select=2,
                 debug=False,
                 debug_threshold=255):
        """init with debugging
        :type drive_select: int
        :param drive_select: 0 = nst0, 1 = nst1, 2 = nst{0,1}
        :type disk_queue: bool
        :param disk_queue: write archives to a disk queue first?
        """

        self.version = version
        self.pid = pid
        self.debug = Debug(self.pid,
                           debug=debug,
                           debug_threshold=debug_threshold)
        self.tape_size = tape_size
        self._tape_dev = '/dev/changer'
        self.status_code = StatusCode
        self.drive_select = drive_select
        self.archive_tar = ''

        self.drive_ids = []
        self.tape_ids = []
        self.label_in_drive = []  ## return label in given drive

        self.check_inventory()
        self.tape_drives = Drives(self.pid,
                                  drive_select=drive_select,
                                  debug=debug,
                                  debug_threshold=debug_threshold)

        self.disk_queue = disk_queue
        if not self.disk_queue:
            ## we need to use Ramtar
            self.ramtar = FastTar(pid,
                                  drive_select=drive_select,
                                  rewrite_path=None,
                                  debug=debug,
                                  debug_threshold=debug_threshold)
        ## TODO(dconover): implement a lock on the changer to prevent overlapping requests
        self.changer_state = 0

    def check_inventory(self):
        """check the current inventory of the library with mtx"""
        output = check_output(['mtx', 'status']).decode("utf-8")
        self.debug.output(output, debug_level=251)
        self.drive_ids, self.tape_ids, self.label_in_drive = split_mtx_output(
            output)
        for drive_id in self.drive_ids:
            self.debug.output(
                '- %s, %s num_tapes: %d' %
                (drive_id, self.drive_ids[drive_id], len(self.tape_ids)))

    def print_inventory(self):
        """print out the current tape library inventory"""
        for drive_id in self.drive_ids:
            print('drive: %s, %s' % (drive_id, self.drive_ids[drive_id]))
        for tape_id in self.tape_ids:
            print('slot: %s, %s' % (tape_id, self.tape_ids[tape_id]))

    def get_tape_slot(self, tape_id):
        """return the slot number where the given tape is currently loaded"""
        return self.tape_ids[tape_id]

    def load_tape_pair(self, tape_ids):
        """load the next available tape pair"""
        load_tape_pair_status = True

        if len(tape_ids) == 2:
            for drive, tape_id in enumerate(tape_ids):
                if load_tape_pair_status is True:
                    self.debug.output('loading', str(tape_id), str(drive))
                    load_tape_pair_status = self.load_tape_drive(tape_id,
                                                                 drive=drive)
                else:
                    self.debug.output(
                        'load failure for tape_id - {}'.format(tape_id))
        else:
            self.debug.output('failed to load tape pair: %s' % tape_ids)
            load_tape_pair_status = False

        return load_tape_pair_status

    ## using type hinting with Sphinx
    ## pycharm doesn't seem to like PEP 3107 style type hinting
    def load_tape_drive(self, tape_id, drive=0):
        """load a given tape_id into a given drive=drive_int, unload if necessary.
        :type  tape_id: label of tape to load
        :param tape_id: label of tape to load"""
        status = False

        self.debug.output('check then load - {}, {}'.format(tape_id, drive))

        for attempt in range(3):
            if self.drives_empty(drive_int=drive):
                self.debug.output('calling load_tape - ',
                                  str(tape_id),
                                  str(drive),
                                  debug_level=128)
                self.load_tape(tape_id, drive)
                status = True
                break

            ## return if the drive already contains the tape we want
            ## just rewind
            elif str(drive) in self.label_in_drive and self.label_in_drive[str(
                    drive)] == tape_id:
                ## if we call this function we probably need a rewind
                self.debug.output('tape loaded; rewinding tape - {}:{}'.format(
                    str(drive), tape_id))
                self.rewind_tape(tape_id)
                status = True
                break

            ## if the drive is full attempt to unload, then retry
            else:
                self.debug.output(
                    'different tape loaded, unloading - {}:{}'.format(
                        str(self.label_in_drive), str(drive)),
                    debug_level=128)
                self.unload_tape_drive(drive)

        return status

    def unload_tape_pair(self):
        """unload the tapes in the current drives"""
        if not self.drives_empty():
            for tape_id in self.drive_ids:
                self.debug.output('unloading', tape_id)
                self.unload_tape(tape_id)

    def unload_tape_drive(self, tape_int):
        """unload the tapes in the current drives"""
        self.debug.output('unloading {}'.format(tape_int))
        if not self.drives_empty(drive_int=tape_int):
            self.debug.output('unloading {} from {}'.format(
                self.label_in_drive[str(tape_int)], tape_int))
            self.unload_tape(self.label_in_drive[str(tape_int)])
        else:
            self.debug.output('tape already empty', str(tape_int))

    def drives_empty(self, drive_int=None):
        """return true if the drives are currently empty"""
        self.debug.output('recheck inventory')
        self.check_inventory()

        if drive_int is not None:
            self.debug.output('called for {} while loaded: {}'.format(
                drive_int, self.label_in_drive))
            if str(drive_int) in self.label_in_drive:
                self.debug.output('drive loaded already - {} with {}'.format(
                    drive_int, self.label_in_drive[str(drive_int)]))
                return False
            else:
                self.debug.output('drive_empty')
                return True
        else:
            ## TODO(dconover): what's this now?
            self.debug.output('basic check drive labels: %s' %
                              self.label_in_drive)
            return not len(self.drive_ids)

    @property
    def drives_loaded(self):
        """return true if the drives are loaded"""
        self.check_inventory()
        if len(self.drive_ids):
            return self.get_drive_tape_ids()
        else:
            return False

    def get_drive_tape_ids(self):
        """get the tape_ids currently loaded in drives"""
        self.check_inventory()
        return self.drive_ids

    def load_tape(self, tape_id, tape_drive):
        """Load a tape into a free drive slot"""
        load_tape_status = True
        try:
            if self.tape_ids[tape_id]:
                self.debug.output('Loading - %s' % tape_id)
                output = check_output([
                    'mtx', 'load',
                    str(self.tape_ids[tape_id]),
                    str(tape_drive)
                ])
                self.check_inventory()
        except KeyError:
            self.debug.output('tape not in storage - {}'.format(tape_id))
            load_tape_status = False

        return load_tape_status

    def unload_tape(self, tape_id):
        """Unload a tape from a drive and put in the original slot"""
        if self.drive_ids[tape_id]:
            command = [
                'mtx', 'unload', self.drive_ids[tape_id][1],
                self.drive_ids[tape_id][0]
            ]
            self.debug.output('%s' % command)
            output = check_output(command)
            self.check_inventory()
        else:
            self.debug.output('tape_id({}) not in drive'.format(tape_id))

    def rewind_tape(self, tape_id):
        """rewind the tape in the given drive"""

        status = False

        try:
            if self.drive_ids[tape_id]:
                self.debug.output('rewinding tape %s' % tape_id)
                output = check_output('mt -f /dev/nst%s rewind' %
                                      (self.drive_ids[tape_id][0]),
                                      shell=True)
                status = True

        except CalledProcessError:
            self.debug.output('rewind error')

        except KeyError:
            self.debug.output('tape (%s) not loaded: %s' %
                              (tape_id, self.drive_ids))

        return status

    def write(self, tape_index, tape_list=None):
        """write data to tape"""
        ## tar dir to two drives
        arcname = "paper.%s.%s" % (self.pid, tape_index)
        tar_name = "/papertape/queue/%s/%s.tar" % (self.pid, arcname)
        catalog_name = "/papertape/queue/%s/%s.file_list" % (self.pid, arcname)

        if self.disk_queue and not tape_list:
            self.debug.output("writing", catalog_name, tar_name)
            self.tape_drives.tar_files([catalog_name, tar_name])
        elif self.disk_queue and tape_list:
            ## what should we do with a disk queue and a tape_list?
            self.debug.output('disk queue with tape_list given')
            raise Exception
        #   self.ramtar.send_archive_to_tape()
        elif not self.disk_queue and tape_list:
            ## write a fast archive to disk using the given list of files
            ##self.ramtar.archive_from_list(tape_list)
            self.debug.output('unnecessary call to write?')

        elif not self.disk_queue and not tape_list:
            self.debug.output('no list given')
            raise Exception

    def prep_tape(self, catalog_file):
        """write the catalog to tape. write all of our source code to the first file"""
        ## write catalog
        self.debug.output("writing catalog to tape", catalog_file)
        self.tape_drives.dd(catalog_file)
        ## write source code
        #self.tape_drives.tar('/root/git/papertape')

    def read_tape_catalog(self, tape_id):
        """read and return first block of tape"""

        self.rewind_tape(tape_id)
        drive_int = self.drive_ids[tape_id][0]

        return self.tape_drives.dd_read(drive_int)

    def count_files(self, tape_id):
        """count files of the given tape"""
        self.rewind_tape(tape_id)
        drive_int = self.drive_ids[tape_id][0]

        return self.tape_drives.count_files(drive_int)

    def tape_archive_md5(self,
                         tape_id,
                         job_pid,
                         catalog_list,
                         md5_dict,
                         drive=0):
        """loop through each archive on tape and check a random file md5 from each

        :rtype: (status_code, str)"""

        ## default to True
        tape_archive_md5_status = self.status_code.OK
        reference = None

        self.debug.output('loading tape: %s' % tape_id)
        ## load a tape or rewind the existing tape
        self.load_tape_drive(tape_id, drive)
        drive_int = self.drive_ids[tape_id][0]

        ## for every tar advance the tape
        ## select a random path from the tape
        ## run md5sum_at_index(tape_index, drive_int=0)
        archive_dict = defaultdict(list)

        ## build a dictionary of archives
        for item in catalog_list:
            self.debug.output('item to check: %s' % item)
            archive_dict[item[0]].append(item[-1])

        for tape_index in archive_dict:
            directory_path = random.choice(archive_dict[tape_index])
            ## starting at the beginning of the tape we can advance one at a
            ## time through each archive and test one directory_path/visdata md5sum
            self.debug.output('checking md5sum for %s' % directory_path)
            md5sum = self.tape_drives.md5sum_at_index(job_pid,
                                                      tape_index,
                                                      directory_path,
                                                      drive_int=drive_int)
            if md5sum != md5_dict[directory_path]:
                self.debug.output('md5sum does not match: %s, %s' %
                                  (md5sum, md5_dict[directory_path]))
                tape_archive_md5_status = self.status_code.tape_archive_md5_mismatch
                reference = ":".join([str(tape_index), directory_path])
                break
            else:
                self.debug.output('md5 match: %s|%s' %
                                  (md5sum, md5_dict[directory_path]))

        self.unload_tape(tape_id)
        return tape_archive_md5_status, reference

    def close_changer(self):
        """cleanup"""
        ## TODO(dconover): implement changer locking; remove lock
        pass

    def append_to_archive(self, file_path, file_path_rewrite=None):
        """add data to an open archive"""
        arcname = file_path if file_path_rewrite is None else file_path_rewrite
        try:
            self.debug.output('file_path={}, arcname={}'.format(
                file_path, arcname))
            self.archive_tar.add(file_path, arcname=arcname)
        except Exception as cept:
            self.debug.output('tarfile exception - {}'.format(cept))
            raise

    def archive_from_list(self, tape_list):
        """take a tape list, build each archive, write to tapes"""

        archive_dict = defaultdict(list)
        archive_list_dict = defaultdict(list)

        if self.drive_select == 2:
            self.debug.output('writing data to two tapes')
            ## for archive group in list
            ## build a dictionary of archives
            for item in tape_list:
                self.debug.output('item to check: {}'.format(item))
                archive_list_dict[item[0]].append(item)
                archive_dict[item[0]].append(item[-1])

            for tape_index in archive_dict:

                data_dir = '/papertape'
                archive_dir = '/papertape/queue/{}'.format(self.pid)
                archive_prefix = 'paper.{}.{}'.format(self.pid, tape_index)
                archive_name = '{}.tar'.format(archive_prefix)
                #archive_file =  '{}/{}'.format(archive_dir,archive_name)
                archive_file = '{}/shm/{}'.format(data_dir, archive_name)
                archive_list = '{}/{}.file_list'.format(
                    archive_dir, archive_prefix)

                self.archive_tar = tarfile.open(archive_file, mode='w:')

                ## for file in archive group build archive
                for item in archive_dict[tape_index]:
                    self.debug.output('item - {}..{}'.format(tape_index, item))
                    #arcname_rewrite = self.rewrite_path
                    data_path = '/'.join([data_dir, item])
                    ## TODO(dconover): remove excess leading paths from archive_path
                    archive_path = '/'.join([archive_prefix, item])
                    self.append_to_archive(data_path,
                                           file_path_rewrite=archive_path)

                ## close the file
                self.archive_tar.close()

                ## send archive group to both tapes
                self.debug.output('send data')
                self.send_archive_to_tape(archive_list, archive_name,
                                          archive_file)

        else:
            ## I don't think it's a good idea to do this, since you have to read the data twice
            self.debug.output('skipping data write')
            pass

    def send_archive_to_tape(self, archive_list, archive_name, archive_file):
        """send the current archive to tape"""
        try:
            self.debug.output('{}'.format(archive_name))
            ## add archive_list, and archive_file
            self.tape_drives.tar_files([archive_list, archive_file])

            ## truncate the current archive to save disk space
            archive_open = open(archive_file, 'w')
            archive_open.truncate(0)

        except Exception as cept:
            self.debug.output('tarfile - {}'.format(cept))
            raise
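
## split_mtx_output() is referenced by check_inventory() but not shown in this
## listing. A minimal sketch of the kind of parsing it implies, assuming
## typical `mtx status` output such as:
##   Data Transfer Element 0:Full (Storage Element 12 Loaded):VolumeTag = PAPR1001
##   Storage Element 3:Full :VolumeTag=PAPR1003
import re

def split_mtx_output_sketch(mtx_output):
    """return (drive_ids, tape_ids, label_in_drive) parsed from mtx status text"""
    drive_ids = {}       ## label -> [drive number, home slot]
    tape_ids = {}        ## label -> storage slot
    label_in_drive = {}  ## drive number (as str) -> label
    drive_line = re.compile(r'Data Transfer Element (\d+):Full '
                            r'\(Storage Element (\d+) Loaded\):VolumeTag\s*=\s*(\S+)')
    slot_line = re.compile(r'Storage Element (\d+):Full\s*:VolumeTag\s*=\s*(\S+)')
    for line in mtx_output.split('\n'):
        drive_match = drive_line.search(line)
        slot_match = slot_line.search(line)
        if drive_match:
            drive, slot, label = drive_match.groups()
            drive_ids[label] = [drive, slot]
            label_in_drive[drive] = label
        elif slot_match:
            slot, label = slot_match.groups()
            tape_ids[label] = slot
    return drive_ids, tape_ids, label_in_drive
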
Example #15
class MtxDB(object):
    """db to handle record of label ids

    Field     Type          Null  Key  Default  Extra
    id        mediumint(9)  NO    PRI  NULL     auto_increment
    label     char(8)       YES        NULL
    date      int(11)       YES        NULL
    status    int(11)       YES        NULL
    capacity  int(11)       YES        NULL

    """
    def __init__(self,
                 version,
                 credentials,
                 pid,
                 debug=False,
                 debug_threshold=255):
        """Initialize connection and collect file_list of tape_ids."""

        self.version = version
        self.pid = pid
        self.debug = Debug(self.pid,
                           debug=debug,
                           debug_threshold=debug_threshold)
        self.status_code = StatusCode

        ## database variables
        self.connection_timeout = 90
        self.connection_time = datetime.timedelta()
        self.credentials = credentials
        self.connect = ''
        self.cur = ''
        self.db_connect('init', credentials)

        self.mtxdb_state = 0  ## current dump state

    def __setattr__(self, attr_name, attr_value):
        """debug.output() when a state variable is updated"""
        class_name = self.__class__.__name__.lower()

        ## we always use the lowercase of the class_name in the state variable
        if attr_name == '{}_state'.format(class_name):
            ## debug whenever we update the state variable
            self.debug.output("updating: {} with {}={}".format(
                class_name, attr_name, attr_value))
        super(MtxDB, self).__setattr__(attr_name, attr_value)

    def update_connection_time(self):
        """refresh database connection"""
        self.debug.output('updating connection_time')
        self.connection_time = datetime.datetime.now()

    def connection_time_delta(self):
        """return connection age"""
        self.debug.output('connection_time:%s' % self.connection_time)
        delta = datetime.datetime.now() - self.connection_time
        return delta.total_seconds()

    def db_connect(self, command=None, credentials=None):
        """connect to the database or reconnect an old session"""
        self.debug.output('input:%s %s' % (command, credentials))
        self.credentials = credentials if credentials is not None else self.credentials
        time_delta = (self.connection_timeout + 1 if command == 'init'
                      else self.connection_time_delta())

        self.debug.output("time_delta:%s, timeout:%s" %
                          (time_delta, self.connection_timeout))
        if time_delta > self.connection_timeout:
            self.debug.output("setting connection %s %s" %
                              (credentials, self.connection_timeout))
            self.connect = pymysql.connect(
                read_default_file=self.credentials,
                connect_timeout=self.connection_timeout)
            self.cur = self.connect.cursor()

        self.update_connection_time()
        self.debug.output("connection_time:%s" % self.connection_time)

    def get_capacity(self, tape_id):
        """return the capacity recorded for the given tape label"""
        select_sql = "select capacity from ids where label='%s'" % tape_id
        self.cur.execute(select_sql)
        return self.cur.fetchone()[0]

    def select_ids(self):
        """select lowest matching id pairs"""

        self.db_connect()
        ids = []
        for n in [0, 1]:
            select_sql = """select label from ids
                where date is null and
                label like 'H0C%d%s'
                order by label
            """ % (n + 1, "%")

            self.cur.execute(select_sql)

            #print(self.cur.fetchone()[0])
            ids.append(self.cur.fetchone()[0])
        return ids

    def insert_ids(self, ids):
        """Add new tape_ids to the mtxdb"""
        self.db_connect()
        for label_id in ids:
            insert_sql = "insert into ids (label) values('%s')" % label_id
            print(insert_sql)
            self.cur.execute(insert_sql)

        self.connect.commit()

    def claim_ids(self, ids):
        """Mark files in the database that are "claimed" by a dump process."""
        self.db_connect()
        for tape_id in ids:
            claim_query = '''update ids 
                set status="%s", description="Paper dump version:%s"
                where label="%s"''' % (self.pid, self.version, tape_id)

            self.debug.output(claim_query)
            self.cur.execute(claim_query)

        self.connect.commit()

    def date_ids(self, ids):
        """write the date of our completed run to tape"""
        date_ids_status = self.status_code.OK
        date = datetime.datetime.now().strftime('%Y%m%d-%H%M')
        self.db_connect()
        for tape_id in ids:
            self.debug.output('updating mtxdb: %s, %s' % (date, tape_id))
            date_sql = 'update ids set date="%s" where label="%s"' % (date,
                                                                      tape_id)
            try:
                self.cur.execute(date_sql)
            except Exception as mysql_error:
                self.debug.output('error {}'.format(mysql_error))
                date_ids_status = self.status_code.date_ids_mysql

        try:
            self.connect.commit()
        except Exception as mysql_error:
            self.debug.output('error {}'.format(mysql_error))
            date_ids_status = self.status_code.date_ids_mysql

        return date_ids_status

    def write(self, src_directory):
        """take a path like /dev/shm/1003261778 and create a tar archive on two tapes"""

        self.update_unused_capacity()
        pass

    def update_unused_capacity(self, used=None):
        """Write out unused capacity to database."""
        self.db_connect()
        pass

    def close_mtxdb(self):
        """cleanup mtxdb state
        """
        ## TODO(dconover): dependent on self.mtx_state: claim/unclaim tapes; close mtxdb
        pass
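
## The SQL above interpolates values straight into query strings; a minimal
## sketch of the same insert using pymysql parameter binding instead, which
## avoids quoting problems (the cursor and connection come from db_connect()):
def insert_ids_parameterized(cursor, connection, ids):
    """insert new tape labels using a parameterized query"""
    for label_id in ids:
        cursor.execute("insert into ids (label) values(%s)", (label_id,))
    connection.commit()
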
Example #16
class Archive(object):
    """Build file archives for tape dumps"""
    def __init__(self,
                 version,
                 pid,
                 debug=False,
                 debug_threshold=255,
                 local_transfer=True):
        """Archive file and tar management

        :type version: int
        :type pid: basestring
        :type local_transfer: bool
        :type debug_threshold: int
        :type debug: bool
        :type self: object
        """

        self.pid = pid
        self.debug = Debug(self.pid,
                           debug=debug,
                           debug_threshold=debug_threshold)

        self.version = version
        #self.transfer = LocalTransfer() if local_transfer else Transfer()
        self.transfer = LocalTransfer() if local_transfer else None

        dir_status, self.archive_copy_dir = self.ensure_dir(
            '/papertape/shm/%s/' % self.pid)
        dir_status, self.queue_dir = self.ensure_dir('/papertape/queue/%s/' %
                                                     self.pid)

        if dir_status is not True:
            self.debug.output('data dir init failed')
            raise Exception

        self.catalog_name = "{0:s}/paper.{1:s}.file_list".format(
            self.queue_dir, self.pid)
        self.tape_ids_filename = "{0:s}/paper.{1:s}.tape_ids.file_list".format(
            self.queue_dir, self.pid)
        self.archive_list = []  ## working file_list of files to write
        self.tape_list = []  ## cumulative file_list of written files
        self.item_index = 0  ## number of file path index (human readable line numbers in catalog)
        self.archive_state = 0  ## current archive state

    def __setattr__(self, attr_name, attr_value):
        """debug.output() when a state variable is updated"""
        class_name = self.__class__.__name__.lower()

        ## we always use the lowercase of the class_name in the state variable
        if attr_name == '{}_state'.format(class_name):
            ## debug whenever we update the state variable
            self.debug.output("updating: {} with {}={}".format(
                class_name, attr_name, attr_value))
        super(Archive, self).__setattr__(attr_name, attr_value)

    def ensure_dir(self, file_path):
        """make sure the directory exists creating it if necessary
        :param file_path: path to make if it doesn't already exist
        :type file_path: str
        """

        ensure_dir_status = True
        dir_path = os.path.dirname(file_path)
        if not os.path.exists(dir_path):
            try:
                os.makedirs(dir_path)
            except Exception as error:
                self.debug.output('mkdir error {}'.format(error))
                ensure_dir_status = False

        return ensure_dir_status, dir_path

    def build_archive(self, file_list, source_select=None):
        """Copy files to /dev/shm/$PID, create md5sum data for all files"""
        for file_name in file_list:
            transfer_path = '%s/%s' % (self.archive_copy_dir, file_name)
            self.debug.output("build_archive - %s" % file_name)
            get("/papertape/" + file_name,
                local_path=transfer_path,
                recursive=True)

    def gen_catalog(self, archive_catalog_file, file_list, tape_index):
        """create a catalog file_name"""
        self.debug.output("intermediate catalog: %s" % archive_catalog_file)
        # noinspection PyArgumentList
        with open(archive_catalog_file, mode='w') as cfile:
            archive_index = 1
            self.archive_list = []
            for file_name in file_list:
                self.debug.output('archive_list: %s %s %s' %
                                  (tape_index, archive_index, file_name),
                                  debug_level=249)
                self.archive_list.append(
                    [tape_index, archive_index, file_name])
                cfile.write("%s:%s:%s\n" %
                            (tape_index, archive_index, file_name))
                archive_index += 1

    def gen_final_catalog(self, tape_catalog_file, tape_list, md5_dict):
        """create a catalog file in /papertape/queue/$pid/$pid.file_list

        :param tape_catalog_file: str
        :param tape_list: file_list of [int, int, string]
        :param md5_dict: md5sums keyed by file path
        """
        self.debug.output('tape_list - %s' % tape_list)

        job_details = " ".join([
            self.pid,
            "(version:",
            str(self.version),
            "on",
            datetime.datetime.now().strftime('%Y%m%d-%H%M') + ")",
        ])

        preamble_lines = "\n".join([
            "## Paper dump catalog:" + job_details,
            "## This tape contains files as listed below:",
            "## item_index:tape_index:archive_index:data_md5:dir_path(host:fullpath)\n"
        ])

        self.item_index = 1

        with open(tape_catalog_file, mode='w') as cfile:
            ## write a preamble to describe the contents
            cfile.write(preamble_lines)

            ## write the actual tape_list
            for file_path in tape_list:
                self.debug.output("%s - %s" % (tape_catalog_file, file_path))
                self.debug.output("file_inf - %s, %s" %
                                  (self.item_index, file_path),
                                  debug_level=249)

                ## which archive on tape has the file_path
                tape_index = file_path[0]
                ## where on the archive is the file_path
                archive_index = file_path[1]
                ## what is the file_path
                file_path = file_path[2]
                ## what is the md5sum of the file_path/visdata_file
                data_md5 = md5_dict[file_path]

                ## We don't actually need the item_index; it is a convenience to the user
                ## when reading the catalog
                catalog_line = [
                    self.item_index, tape_index, archive_index, data_md5,
                    file_path
                ]
                output = ':'.join(str(x) for x in catalog_line) + "\n"

                ## write the tape_catalog to a file
                cfile.write(output)
                self.item_index += 1

            self.item_index -= 1

    def final_from_file(self, catalog=None, tape_ids=False):
        """gen final catalog from file_name"""
        self.archive_list = []
        md5_dict = {}
        pid = ''
        item_index = 0

        ## catalog includes a human readable preamble with dump info
        ## and numbered lines of items like:
        ## "item_index:tape_index:archive_index:visdata_md5sum:directory_path"
        header_line = re.compile('## Paper dump catalog:([0-9]+)')
        catalog_line = re.compile(
            '([0-9]+):([0-9]+):([0-9]+):([a-f0-9]{32}):(.*)')

        if catalog:
            self.debug.output('reading from string')
            catalog_lines = catalog

        else:
            ## read from file_name
            self.debug.output('reading from file_name')
            with open(self.catalog_name, mode='r') as file_name:
                catalog_lines = file_name.readlines()

        for line in catalog_lines:
            if catalog_line.match(line):
                ## split the line into groups
                catalog_info = catalog_line.match(line).groups()

                ## the first number is mostly for human consumption
                item_index = int(catalog_info[0])

                ## the original catalog looks like the last three entries
                tape_index = int(catalog_info[1])
                archive_index = int(catalog_info[2])
                file_path = catalog_info[4]
                md5_dict[file_path] = catalog_info[3]

                catalog_list = [tape_index, archive_index, file_path]

                self.archive_list.append(catalog_list)

            elif header_line.match(line):
                self.debug.output('found header line')
                pid = header_line.match(line).groups()[0]

        return item_index, self.archive_list, md5_dict, pid

    def queue_archive(self, tape_index, file_list):
        """move the archive from /dev/shm to a tar file in the queue directory
           once we have 1.5tb of data we will create a catalog and write all the queued
           archives to tape.
        """
        arcname = "%s.%s.%s" % ('paper', self.pid, tape_index)
        tar_name = "%s/%s.tar" % (self.queue_dir, arcname)
        catalog_name = "%s/%s.file_list" % (self.queue_dir, arcname)

        ## make the tar in the queue_directory
        self.tar_archive(self.archive_copy_dir, arcname, tar_name)

        ## make room for additional transfers
        self.rm_archive_copy_dir_list(file_list)

        ## make the catalog
        self.gen_catalog(catalog_name, file_list, tape_index)

    def tar_fast_archive(self, tape_id, file_list):
        """send tar of file chunks directly to tape."""
        arcname = "%s.%s.%s" % ('paper', self.pid, tape_id)
        tar_name = "%s/%s.tar" % (self.queue_dir, arcname)
        catalog_name = "%s/%s.file_list" % (self.queue_dir, arcname)

        ## make the tar in the queue_directory
        self.tar_archive(self.archive_copy_dir, arcname, tar_name)

        ## make the catalog
        self.gen_catalog(catalog_name, file_list, tape_id)

    def rm_archive_copy_dir_list(self, file_list):
        """remove the given directory tree of files that have been copied into
        the temporary archive_copy_dir

        :param file_list: file_list of files
        :type  file_list: list
        """
        for dir_path in file_list:
            shutil.rmtree('%s/%s' % (self.archive_copy_dir, dir_path))

    def tar_archive(self, source, arcname, destination):
        """create the queued tar for the archive file"""
        archive_file = tarfile.open(destination, mode='w')
        archive_file.add(source, arcname=arcname)
        archive_file.close()

    def md5(self, directory_prefix, file_path):
        """return an md5sum for a file"""
        full_path = '%s/%s' % (directory_prefix, file_path)
        hasher = hashlib.md5()
        with open('{}.md5sum'.format(full_path), mode='w') as hash_file:
            with open(full_path, mode='rb') as open_file:
                file_buffer = open_file.read()
                hasher.update(file_buffer)

            hash_file.write('%s\n' % hasher.hexdigest())
        return hasher.hexdigest()

    def save_tape_ids(self, tape_ids):
        """open a file and write the tape ids in case writing to the db fails"""

        self.debug.output('saving {0:s} to {1:s}'.format(
            tape_ids, self.tape_ids_filename))
        tape_id_file = open(self.tape_ids_filename, mode='w')
        tape_id_file.write("[{0:s}]\n".format(tape_ids))
        tape_id_file.close()

    def tape_ids_from_file(self):
        """Assuming you init from queued run, read in the tape ids from the
        tape_ids_file"""

        tape_ids = ''
        tape_id_line = re.compile(r"\[(.*)\]")
        self.debug.output('{0:s}'.format(self.tape_ids_filename),
                          debug_level=128)
        with open(self.tape_ids_filename, mode='r') as tape_id_file:
            self.debug.output("opening_file", debug_level=128)
            for line in tape_id_file:
                self.debug.output('{0:s}'.format(line), debug_level=240)
                if tape_id_line.match(line):
                    tape_info = tape_id_line.match(line).groups()
                    tape_ids = tape_info[0]

        id_list = tape_ids.split(",")
        return id_list

    def close_archive(self):
        """release any locks from the changer"""
        pass
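
## Archive.md5() above reads each file into memory in a single call; a minimal
## chunked variant for large visdata files (the 1 MiB chunk size is an
## arbitrary choice, not taken from the original):
import hashlib

def md5_chunked(full_path, chunk_size=1024 * 1024):
    """return the md5 hexdigest of a file, reading it in chunks"""
    hasher = hashlib.md5()
    with open(full_path, mode='rb') as open_file:
        for chunk in iter(lambda: open_file.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()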