def test_syncfile_01(testdir):
    """
    Attempt to sync a source file that doesn't exist.

    Should throw an error.
    source NO

    The source path is built by appending 'xyz' to the path of an existing
    test object, so it is expected not to exist on disk.
    """
    testdir.reset()
    syncopts = syncopts_defaults.copy()
    syncopts['tmpbase'] = os.path.abspath(testdir.psconfig.TMP_DIR)
    # values() returns a non-indexable view on Python 3; materialize it first.
    f = list(testdir.objects.values())[0][0]
    src = fsitem.FSItem('{0}xyz'.format(f.path))
    tgt = fsitem.FSItem(src.absname.replace(testdir.source, testdir.target))
    with pytest.raises(pylut.SyncError) as einfo:
        pylut.syncfile(src_path=src, tgt_path=tgt, **syncopts)
    # Checked outside the "with" block: statements after the raising call
    # inside the block would never run.
    assert 'No such file or directory' in einfo.value.reason
def test_syncfile_07(testdir):
    """
    Existing tmp no longer matches the source, keep tmp.

    tmp MISMATCH
    tgt NO
    Expect tmp to get unlinked, then test is same as test_syncfile_02
    Verify tmp has new FID
    test-pair = 11
    """
    testdir.reset()
    testdir.mk_all_tgtdirs()
    options = syncopts_defaults.copy()
    options['tmpbase'] = os.path.abspath(testdir.psconfig.TMP_DIR)
    for entry in testdir.files:
        source = fsitem.FSItem(entry.path)
        mirrored = source.absname.replace(testdir.source, testdir.target)
        target = fsitem.FSItem(mirrored)

        # First pass: sync with keeptmp so a tmp file is left behind
        options['keeptmp'] = True
        tmp, _action = pylut.syncfile(
            src_path=source, tgt_path=target, **options)

        # Remove the target so only the tmp file remains
        target_name = str(target)
        os.unlink(target_name)
        assert not os.path.lexists(target_name)
        target.update()

        # Remember the FID of the tmp file from the first pass
        old_tmp_fid = tmp.inode()

        # Modify the source so the existing tmp is stale
        if not source.is_regular():
            # non-regular files can only be touched, so wait for the clock
            time.sleep(1)
            _touch(source)
        else:
            # growing the file is quicker than sleeping
            extra = random.randint(1, 1024)
            _mkregfile(source, size=source.size + extra)
        source.update()

        # Second pass: a fresh tmp file is expected
        tmp, _action = pylut.syncfile(
            src_path=source, tgt_path=target, **options)
        # tmp is new, refresh its cached metadata
        tmp.update()

        # source and target must be in sync
        assert _files_match(source, target, options)
        # tmp and target must be the same file
        assert _files_equal(target, tmp)
        # the tmp FID must have changed
        assert old_tmp_fid != tmp.inode()
def test_syncfile_02(testdir):
    """
    First-time sync, keep tmp.

    tmp NO
    target NO
    keeptmp YES
    test-pair = 03
    """
    testdir.reset()
    testdir.mk_all_tgtdirs()
    options = syncopts_defaults.copy()
    tmp_root = os.path.abspath(testdir.psconfig.TMP_DIR)
    options['keeptmp'] = True
    options['tmpbase'] = tmp_root
    for entry in testdir.files:
        source = fsitem.FSItem(entry.path)
        mirrored = source.absname.replace(testdir.source, testdir.target)
        target = fsitem.FSItem(mirrored)
        pylut.syncfile(src_path=source, tgt_path=target, **options)
        # after the sync, source and target must match
        assert _files_match(source, target, options)
def test_syncfile_06(testdir):
    """
    Existing tmp is valid, keep tmp.

    tmp OK
    tgt NO
    expect tgt file hardlink to be created
    tmp file should remain untouched
    test-pair = 10
    """
    testdir.reset()
    testdir.mk_all_tgtdirs()
    options = syncopts_defaults.copy()
    options['tmpbase'] = os.path.abspath(testdir.psconfig.TMP_DIR)
    for entry in testdir.files:
        source = fsitem.FSItem(entry.path)
        mirrored = source.absname.replace(testdir.source, testdir.target)
        target = fsitem.FSItem(mirrored)

        # First pass: sync with keeptmp so a tmp file is left behind
        options['keeptmp'] = True
        tmp, _action = pylut.syncfile(
            src_path=source, tgt_path=target, **options)

        # Remove the target so only the (valid) tmp file remains
        target_name = str(target)
        os.unlink(target_name)
        assert not os.path.lexists(target_name)
        target.update()

        # Remember the tmp FID, then refresh tmp metadata
        old_tmp_fid = tmp.inode()
        tmp.update()

        # Second pass: the valid tmp should be reused.
        # (An elapsed-time check of < 1 second was once sketched here
        # but is not enabled.)
        tmp, _action = pylut.syncfile(
            src_path=source, tgt_path=target, **options)

        # source and target must be in sync
        assert _files_match(source, target, options)
        # tmp and target must be the same file
        assert _files_equal(target, tmp)
        # the tmp FID must be unchanged
        assert old_tmp_fid == tmp.inode()
def test_syncfile_04(testdir):
    """
    Existing target is valid, keep tmp.

    tmp NO
    tgt OK
    expect tmp file hardlink to be created,
    tgt file should remain untouched
    test-pair = 08
    """
    testdir.reset()
    testdir.mk_all_tgtdirs()
    options = syncopts_defaults.copy()
    options['tmpbase'] = os.path.abspath(testdir.psconfig.TMP_DIR)
    for entry in testdir.files:
        source = fsitem.FSItem(entry.path)
        mirrored = source.absname.replace(testdir.source, testdir.target)
        target = fsitem.FSItem(mirrored)

        # First pass: create the target without keeping the tmp file
        options['keeptmp'] = False
        tmp, _action = pylut.syncfile(
            src_path=source, tgt_path=target, **options)

        # Remember the target FID, then refresh target metadata
        old_tgt_fid = target.inode()
        target.update()

        # The tmp file must be gone after a keeptmp=False sync
        assert not os.path.lexists(str(tmp))

        # Second pass: this time keep the tmp file.
        # (An elapsed-time check of < 1 second was once sketched here
        # but is not enabled.)
        options['keeptmp'] = True
        tmp, _action = pylut.syncfile(
            src_path=source, tgt_path=target, **options)

        # tmp and target must be the same file
        assert _files_equal(target, tmp)
        # the target FID must be unchanged
        assert old_tgt_fid == target.inode()
        # source and target must be in sync
        assert _files_match(source, target, options)
def syncfile(src_path, tgt_path, tmpbase=None, keeptmp=False, synctimes=False,
             syncperms=False, syncowner=False, syncgroup=False,
             pre_checksums=False, post_checksums=True):
    """ Lustre stripe aware file sync

        Copies a file to temporary location, then creates a hardlink for the
        target.

        If either the tmp or the target file already exist, that existing file
        will be checked for accuracy by checking size and mtime (and checksums
        if pre_checksums=True). If synctimes=False, tgt is assumed to be equal
        if tgt_mtime >= src_mtime; otherwise, if synctimes=True, tgt_mtime must
        be exactly equal to src_mtime or tgt will be assumed to be out of sync.
        If a valid tmp or tgt exist and one or more of synctimes, syncperms,
        syncowner, syncgroup are specified, the specified metadata attributes
        of tmp and/or tgt file will be checked and updated.

        If both tmp and tgt already exist, both will be checked for accuracy
        against src. If both tmp and tgt are valid (accurate matches), nothing
        happens. If at least one of tmp or tgt are found to exist and be valid,
        the invalid file will be removed and a hardlink created to point to the
        valid file, thus avoiding a full file copy.

        If keeptmp=False, the tmp file hardlink will be removed. When copying
        a file with multiple hard links, set keeptmp=True to keep the tempfile
        around so the other hard links will not result in additional file
        copies. It is up to the user of this function to remove the tmp files
        at a later time.

        The tmpbase parameter cannot be None (this requirement may be removed
        in a future version). tmpbase will be created if necessary. The tmpbase
        directory structure will not be removed and therefore must be cleaned
        up manually.

        If post_checksums=True (default), the checksums for src and tgt should
        be immediately available on the same parameters that were passed in
        (ie: src_path.checksum() and tgt_path.checksum() )

        :param src_path FSItem: source file
        :param tgt_path FSItem: target file
        :param tmpbase str: absolute path to directory where tmp files will be
                            created
        :param keeptmp bool: if True, do not delete tmpfile (default=False)
        :param synctimes bool: sync file times (default=False)
        :param syncperms bool: sync file permissions (default=False)
        :param syncowner bool: sync file owner (default=False)
        :param syncgroup bool: sync file group (default=False)
        :param pre_checksums bool: use checksum to determine if src and tgt
                                   differ (default=False)
        :param post_checksums bool: if source was copied to target, compare
                                    checksums to verify target was written
                                    correctly (default=True)
        :return two-tuple:
            1. fsitem.FSItem: full path to tmpfile (even if keeptmp=False)
            2. action_taken: dict with keys of 'data_copy' and 'meta_update'
               and values of True or False depending on the action taken
        :raises UserWarning: if tmpbase is None, or if dd reports more than
                             two lines on stderr
        :raises SyncError: if the source inode lookup, tmpdir creation,
                           setstripe, rsync, hardlink, tmp removal or the
                           post-copy checksum comparison fails
    """
    if tmpbase is None:
        #TODO - If tmpbase is None, create one at the mountpoint
        # tmpbase = _pathjoin(
        #     fsitem.getmountpoint( tgt_path ),
        #     '.pylutsyncfiletmpbase' )
        raise UserWarning('Default tmpbase not yet implemented')
    # Construct full path to tmpfile: base + <5-char hex value> + <INODE>
    try:
        srcfid = src_path.inode()
    except (Run_Cmd_Error) as e:
        raise SyncError(reason=e.reason, origin=e)
    # NOTE(review): if srcfid is a str, hash() is randomized per process on
    # Python 3 unless PYTHONHASHSEED is fixed, so this bucket name may differ
    # between runs - confirm whether that matters for tmp reuse.
    tmpdir = _pathjoin(tmpbase, hex(hash(srcfid))[-5:])
    tmp_path = fsitem.FSItem(os.path.join(tmpdir, srcfid))
    log.debug('tmp_path:{0}'.format(tmp_path))
    # rsync logic: what already exists on the tgt FS and what needs to be updated
    # The decision phase below only sets these flags/operands; the actions
    # themselves are carried out afterwards, in a fixed order.
    do_mktmpdir = False
    do_setstripe = False
    setstripe_tgt = None
    setstripe_stripeinfo = None
    do_rsync = False
    rsync_src = None
    rsync_tgt = None
    do_hardlink = False
    hardlink_src = None
    hardlink_tgt = None
    do_checksums = False
    sync_action = {'data_copy': False, 'meta_update': False}
    # Options bundle handed to _compare_files
    syncopts = {
        'synctimes': synctimes,
        'syncperms': syncperms,
        'syncowner': syncowner,
        'syncgroup': syncgroup,
        'pre_checksums': pre_checksums,
        'post_checksums': post_checksums,
    }
    tmp_exists, tmp_data_ok, tmp_meta_ok = (False, ) * 3
    tgt_exists, tgt_data_ok, tgt_meta_ok = (False, ) * 3
    tmp_exists = tmp_path.exists()
    if tmp_exists:
        log.debug('tmp exists, comparing tmp to src')
        tmp_data_ok, tmp_meta_ok = _compare_files(src_path, tmp_path, syncopts)
    tgt_exists = tgt_path.exists()
    if tgt_exists:
        log.debug('tgt exists, comparing tgt to src')
        tgt_data_ok, tgt_meta_ok = _compare_files(src_path, tgt_path, syncopts)
    # Case 1: both exist. Either nothing/metadata-only is needed, or one or
    # both files are unlinked so that a later case takes over.
    if tmp_exists and tgt_exists:
        log.debug('tmp and tgt exist')
        if tmp_path.inode() == tgt_path.inode():
            log.debug('tmp and tgt are same file')
            if tmp_data_ok:
                if not tmp_meta_ok:
                    log.debug('tmp needs metadata update')
                    sync_action['meta_update'] = True
                    do_rsync = True
                    rsync_src = src_path
                    rsync_tgt = tmp_path
            else:
                log.debug('tmp not ok, unset all')
                os.unlink(str(tmp_path))
                tmp_path.update()
                os.unlink(str(tgt_path))
                tgt_path.update()
                tmp_exists, tmp_data_ok, tmp_meta_ok = (False, ) * 3
                tgt_exists, tgt_data_ok, tgt_meta_ok = (False, ) * 3
        else:
            log.debug('tmp and tgt are different files')
            # check if one of tmp or tgt are ok, to avoid unnecessary data transfer
            if tmp_data_ok:
                log.debug('tmp data ok, unset tgt vars')
                os.unlink(str(tgt_path))
                tgt_path.update()
                tgt_exists, tgt_data_ok, tgt_meta_ok = (False, ) * 3
            elif tgt_data_ok:
                log.debug('tgt data ok, unset tmp vars')
                os.unlink(str(tmp_path))
                tmp_path.update()
                tmp_exists, tmp_data_ok, tmp_meta_ok = (False, ) * 3
            else:
                log.debug('neither tmp nor tgt are ok, unset both')
                os.unlink(str(tmp_path))
                tmp_path.update()
                os.unlink(str(tgt_path))
                tgt_path.update()
                tmp_exists, tmp_data_ok, tmp_meta_ok = (False, ) * 3
                tgt_exists, tgt_data_ok, tgt_meta_ok = (False, ) * 3
    # Case 2: exactly one exists (possibly as a result of Case 1 above).
    # A valid file is hardlinked to the missing name; an invalid one is removed.
    if tmp_exists != tgt_exists:
        # only one file exists
        if tmp_exists:
            log.debug('tmp exists, tgt doesnt')
            if tmp_data_ok:
                log.debug('tmp data ok, tgt needs hardlink')
                do_hardlink = True
                hardlink_src = tmp_path
                hardlink_tgt = tgt_path
                if not tmp_meta_ok:
                    log.debug('tmp needs meta update')
                    sync_action['meta_update'] = True
                    do_rsync = True
                    rsync_src = src_path
                    rsync_tgt = tmp_path
            else:
                log.debug('tmp not ok, unset tmp vars')
                os.unlink(str(tmp_path))
                tmp_path.update()
                tmp_exists, tmp_data_ok, tmp_meta_ok = (False, ) * 3
        else:
            log.debug('tgt exists, tmp doesnt')
            if tgt_data_ok:
                log.debug('tgt data ok')
                if keeptmp:
                    log.debug('keeptmp=True, tmp needs hardlink')
                    do_mktmpdir = True
                    do_hardlink = True
                    hardlink_src = tgt_path
                    hardlink_tgt = tmp_path
                else:
                    log.debug('keeptmp=False, no action needed')
                if not tgt_meta_ok:
                    log.debug('tgt needs metadata update')
                    sync_action['meta_update'] = True
                    do_rsync = True
                    rsync_src = src_path
                    rsync_tgt = tgt_path
            else:
                log.debug('tgt not ok, unset tgt vars')
                os.unlink(str(tgt_path))
                tgt_path.update()
                tgt_exists, tgt_data_ok, tgt_meta_ok = (False, ) * 3
    # Case 3: neither exists (possibly as a result of the cases above):
    # a full data copy is required.
    if not (tmp_exists or tgt_exists):
        log.debug('neither tmp nor tgt exist')
        sync_action.update(data_copy=True, meta_update=True)
        if src_path.is_regular():
            do_setstripe = True
            setstripe_stripeinfo = src_path.stripeinfo()
        if keeptmp:
            do_mktmpdir = True
            setstripe_tgt = tmp_path  #will be ignored if do_setstripe is False
            do_rsync = True
            rsync_src = src_path
            rsync_tgt = tmp_path
            do_hardlink = True
            hardlink_src = tmp_path
            hardlink_tgt = tgt_path
            do_checksums = True
        else:
            log.debug('keeptmp is false, skipping tmpfile creation')
            setstripe_tgt = tgt_path  #will be ignored if do_setstripe is False
            do_rsync = True
            rsync_src = src_path
            rsync_tgt = tgt_path
            do_checksums = True
    # ---- Action phase: carry out whatever was decided above ----
    if do_mktmpdir:
        # Ensure tmpdir exists
        log.debug('create tmpdir {0}'.format(tmpdir))
        try:
            os.makedirs(tmpdir)
        except (OSError) as e:
            # OSError: [Errno 17] File exists
            if e.errno != 17:
                raise SyncError('Unable to create tmpdir {0}'.format(tmpdir), e)
    if do_setstripe:
        # Set stripe to create the new file with the expected stripe information
        log.debug('setstripe (create) {0}'.format(setstripe_tgt))
        try:
            setstripeinfo(setstripe_tgt,
                          count=setstripe_stripeinfo.count,
                          size=setstripe_stripeinfo.size)
        except (Run_Cmd_Error) as e:
            msg = 'Setstripe failed for {0}'.format(setstripe_tgt)
            raise SyncError(msg, e)
        # NOTE(review): this size check dereferences rsync_src, which is only
        # assigned on paths that also set do_rsync. It is laid out here under
        # the do_setstripe branch (new regular file, rsync_src always set) -
        # confirm this nesting against the intended layout.
        if rsync_src.size > env['PYLUTRSYNCMAXSIZE']:
            # DD for large files
            # TODO - replace dd with ddrescue (for efficient handling of sparse files)
            cmd = ['/bin/dd']
            opts = {
                'bs': 4194304,
                'if': rsync_src,
                'of': rsync_tgt,
                'status': 'noxfer',
            }
            args = None
            (output, errput) = runcmd(cmd, opts, args)
            if len(errput.splitlines()) > 2:
                #TODO - it is hackish to ignore errors based on line count, better is to
                #       use a dd that supports "status=none"
                raise UserWarning(
                    "errors during dd of '{0}' -> '{1}': output='{2}' errors='{3}'"
                    .format(rsync_src, rsync_tgt, output, errput))
    if do_rsync:
        # Do the rsync
        cmd = [env['PYLUTRSYNCPATH']]
        opts = {'--compress-level': 0}
        args = ['-l', '-A', '-X', '--super', '--inplace', '--specials']
        # Only the metadata classes the caller asked for are synced
        if synctimes:
            args.append('-t')
        if syncperms:
            args.append('-p')
        if syncowner:
            args.append('-o')
        if syncgroup:
            args.append('-g')
        args.extend([rsync_src, rsync_tgt])
        try:
            (output, errput) = runcmd(cmd, opts, args)
        except (Run_Cmd_Error) as e:
            raise SyncError(reason=e.reason, origin=e)
        # Any stderr output from rsync is treated as a failure
        if len(errput) > 0:
            raise SyncError(
                reason="errors during sync of '{0}' -> '{1}'".format(
                    rsync_src, rsync_tgt),
                origin="output='{0}' errors='{1}'".format(output, errput))
    if do_hardlink:
        log.debug('hardlink {0} <- {1}'.format(hardlink_src, hardlink_tgt))
        try:
            os.link(str(hardlink_src), str(hardlink_tgt))
        except (OSError) as e:
            raise SyncError(
                reason='Caught exception for link {0} -> {1}'.format(
                    hardlink_src, hardlink_tgt),
                origin=e)
    # Delete tmp
    # Identity test: only the literal False triggers removal, other falsy
    # values (None, 0) do not.
    if keeptmp is False:
        log.debug('unlink tmpfile {0}'.format(tmp_path))
        try:
            os.unlink(str(tmp_path))
        except (OSError) as e:
            # OSError: [Errno 2] No such file or directory
            # (a missing tmp file is fine, e.g. when it was never created)
            if e.errno != 2:
                raise SyncError(
                    'Error attempting to delete tmp {0}'.format(tmp_path), e)
        #tmp_path.update()
        # TODO - replace rmtree with safer alternative
        #        walk dirs backwards and rmdir each
        #shutil.rmtree( tmpbase ) #this will force delete everything, careful
    if do_checksums and post_checksums:
        # Compare checksums to verify target file was written accurately
        src_checksum = src_path.checksum()
        tgt_checksum = tgt_path.checksum()
        if src_checksum != tgt_checksum:
            reason = 'Checksum mismatch'
            origin = 'src_file={sf}, tgt_file={tf}, '\
                     'src_checksum={sc}, tgt_checksum={tc}'.format(
                sf=src_path, tf=tgt_path, sc=src_checksum, tc=tgt_checksum )
            raise SyncError(reason, origin)
    return (tmp_path, sync_action)
import pylut
import fsitem

# Example invocation: sync a single file from a home directory to a
# project directory.
# NOTE(review): no tmpbase is passed here; syncfile as defined in this file
# raises UserWarning when tmpbase is None - confirm this example is current.
src_path = fsitem.FSItem('/u/staff/aloftus/lustre_version.pbs')
tgt_path = fsitem.FSItem('/projects/test/psynctest/lustre_version.pbs')
pylut.syncfile(src_path=src_path, tgt_path=tgt_path)