def get_changes(self, cache_status, cached_old_path, cached_new_path):
    logger = logging.getLogger(__name__)

    is_subdir_oldpath = False
    is_subdir_newpath = False

    logger.info('Checking for changes between %s and %s',
                self.old_datapath, self.new_datapath)

    if cache_status == CacheStatus.NOT_CACHED:
        change_dir = comparator.compare(self.old_datapath,
                                        self.new_datapath,
                                        self.stagingdir)
    else:
        with open(self.cache_entries, 'r') as f:
            cache = yaml.safe_load(f)
        change_dir = cache[cached_new_path][cached_old_path]

    if cached_old_path != self.old_datapath:
        is_subdir_oldpath = True
    if cached_new_path != self.new_datapath:
        is_subdir_newpath = True

    logger.info('Retrieving changes between %s and %s',
                self.old_datapath, self.new_datapath)

    change = FilesystemChange(cached_old_path, cached_new_path,
                              self.stagingdir)
    if is_subdir_newpath:
        indexdir = os.path.join(self.stagingdir, 'indexes',
                                get_hash_id(cached_new_path))
        change.new_nfiles = get_subdir_nfiles(self.new_datapath, indexdir)
    if is_subdir_oldpath:
        indexdir = os.path.join(self.stagingdir, 'indexes',
                                get_hash_id(cached_old_path))
        change.old_nfiles = get_subdir_nfiles(self.old_datapath, indexdir)

    change_data_dir = os.path.join(self.cachedir, change_dir)

    if not (is_subdir_oldpath or is_subdir_newpath):
        set_change_from_cache(change, change_data_dir)
    else:
        compare_hash = dacman_utils.hash_comparison_id(self.old_datapath,
                                                       self.new_datapath)
        change_data_subdir = os.path.join(self.cachedir, compare_hash)
        if os.path.exists(change_data_subdir):
            set_change_from_cache(change, change_data_subdir)
        else:
            save_subdir_changes_to_cache(change, self.stagingdir,
                                         cached_old_path, cached_new_path,
                                         self.old_datapath, self.new_datapath,
                                         is_subdir_oldpath, is_subdir_newpath,
                                         change_data_dir, change_data_subdir)

    logger.info('Updating change cache entries')
    change_id = dacman_utils.hash_comparison_id(self.old_datapath,
                                                self.new_datapath)
    change_info = {self.new_datapath: {self.old_datapath: change_id}}
    dacman_utils.update_yaml(change_info, self.cache_entries)
    logger.info('Change retrieval completed')

    return change
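# For reference, the ENTRIES cache updated above is a nested YAML mapping from
# the new (revision) path to the old (base) path to a change id. Illustrative
# sketch only; the paths and id below are made up:
#
#   /data/sim/v2:
#     /data/sim/v1: 9b1d5e8c2f...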
def append(datapath, usermeta, custom_stagingdir=None):
    logger = logging.getLogger(__name__)

    if not custom_stagingdir:
        stagingdir = dacman_utils.DACMAN_STAGING_LOC
    else:
        stagingdir = custom_stagingdir

    indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))
    if not os.path.exists(indexdir):
        logger.error('Data is not indexed... please index before adding metadata!')
        sys.exit()

    if not usermeta:
        logger.warning('No user metadata provided. Exiting...')
        return

    meta_file = os.path.join(indexdir, 'METADATA')
    metadata = dacman_utils.load_yaml(meta_file)
    if not metadata:
        metadata = {}

    # Append to any existing user metadata instead of overwriting it.
    if 'USER_DEFINED_METADATA' in metadata:
        newmeta = metadata['USER_DEFINED_METADATA'] + ', ' + usermeta
    else:
        newmeta = usermeta
    extended_metadata = {'USER_DEFINED_METADATA': newmeta}
    dacman_utils.dump_yaml(extended_metadata, meta_file)
    logger.info('New user metadata added')
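# Minimal usage sketch (illustrative path and metadata; the dataset must have
# been indexed first, otherwise append() exits):
#
#   append('/data/sim/v1', 'instrument=beamline-8, operator=alice')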
def retrieve(datapath, custom_stagingdir=None):
    logger = logging.getLogger(__name__)

    if not custom_stagingdir:
        stagingdir = dacman_utils.DACMAN_STAGING_LOC
    else:
        stagingdir = custom_stagingdir

    indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))
    if not os.path.exists(indexdir):
        logger.error('Data is not indexed... please index before retrieving metadata!')
        sys.exit()

    meta_file = os.path.join(indexdir, 'METADATA')
    metadata = dacman_utils.load_yaml(meta_file) or {}
    if 'USER_DEFINED_METADATA' in metadata:
        print(metadata['USER_DEFINED_METADATA'])
    else:
        print('No user-defined metadata available for the dataset')
    logger.info('User metadata retrieved')
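# Continuing the sketch above, retrieve() prints whatever append() stored:
#
#   retrieve('/data/sim/v1')
#   # -> instrument=beamline-8, operator=alice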
def clean(datadirs):
    logger = logging.getLogger(__name__)

    logger.info('Removing indexes for %s', ', '.join(datadirs))

    indexdir = os.path.join(dacman_utils.DACMAN_STAGING_LOC, 'indexes')
    cachedir = os.path.join(dacman_utils.DACMAN_STAGING_LOC, 'cache')
    cachefile = os.path.join(cachedir, 'ENTRIES')

    # Drop any cached comparisons that reference the directories being
    # cleaned, whether a directory appears as the new (key) or old (value)
    # path of a comparison.
    if os.path.exists(cachefile):
        cache = dacman_utils.load_yaml(cachefile)
        for datadir in datadirs:
            path = os.path.abspath(datadir)
            if path in cache:
                for comp in cache[path]:
                    cache_data = os.path.join(cachedir, cache[path][comp])
                    shutil.rmtree(cache_data)
                del cache[path]
            else:
                to_delete = []
                for k in cache:
                    for s in cache[k]:
                        if s == path:
                            to_delete.append([k, s])
                for k, s in to_delete:
                    cache_data = os.path.join(cachedir, cache[k][s])
                    shutil.rmtree(cache_data)
                    del cache[k][s]
        dacman_utils.dump_yaml(cache, cachefile)

    for datadir in datadirs:
        path = os.path.abspath(datadir)
        indexes = os.path.join(indexdir, get_hash_id(path))
        if os.path.exists(indexes):
            index_file = os.path.join(indexdir, 'INDEXED_PATHS')
            shutil.rmtree(indexes)
            index_metadata = dacman_utils.load_yaml(index_file)
            del index_metadata[path]
            dacman_utils.dump_yaml(index_metadata, index_file)
            logger.info('Indexes removed for %s', datadir)
        elif os.path.exists(datadir):
            logger.warning('Indexes and metadata for directory %s are not staged', datadir)
        else:
            logger.error('Data directory %s does not exist', datadir)
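# Usage sketch (illustrative paths): clean() takes a list of data directories
# and removes both their staged indexes and any cached comparisons that
# reference them:
#
#   clean(['/data/sim/v1', '/data/sim/v2'])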
def tigres_index(stagingdir, datapath):
    indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))
    deduce_file = os.path.join(indexdir, 'FILEPATHS')
    if not os.path.exists(deduce_file):
        scanner.scan(datapath, os.path.dirname(stagingdir))
    filelist = read_filelist(deduce_file)

    exec_name = 'EXECUTION_DISTRIBUTE_PROCESS'
    exec_plugin = tigres.utils.Execution.get(exec_name)
    try:
        logfile = 'deduce_index_{}.log'.format(round(time.time()))
        tigres.start(name='deduce_index', log_dest=logfile,
                     execution=exec_plugin)
        tigres.set_log_level(tigres.Level.ERROR)

        # One hashing task, applied in parallel over the whole file list.
        task_array = tigres.TaskArray(tasks=[])
        task_hash = tigres.Task("hash_index", task_type=tigres.FUNCTION,
                                impl_name=calculate_hash)
        task_array.append(task_hash)

        input_list = [[datapath, file] for file in filelist]
        input_array = tigres.InputArray(values=input_list)

        logger.info('Indexing %d files', len(filelist))
        indexes = tigres.parallel('index_files', input_array=input_array,
                                  task_array=task_array)
        save_indexes(indexdir, indexes)
    except tigres.utils.TigresException as e:
        logger.error(str(e))
    tigres.end()

    return indexdir
def mp_index(stagingdir, datapath):
    indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))
    deduce_file = os.path.join(indexdir, 'FILEPATHS')
    if not os.path.exists(deduce_file):
        scanner.scan(datapath, stagingdir)
    filelist = read_filelist(deduce_file)

    logger.info('Indexing %d files', len(filelist))

    # Hash files in parallel, one task per file, across all available cores.
    num_procs = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=num_procs)
    results = [pool.apply_async(calculate_hash, args=(datapath, filename))
               for filename in filelist]
    pool.close()
    pool.join()

    indexes = [result.get() for result in results]
    save_indexes(indexdir, indexes)

    return indexdir
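# Hedged sketch (not part of the original module): `calculate_hash` is defined
# elsewhere in this package. Since the comparator keys off the MD5 digest of a
# zero-byte file, it presumably computes an MD5 content hash per file; the name
# `_example_calculate_hash` and its (relative path, hex digest) return shape
# are assumptions for illustration only. `os` is already imported above.
import hashlib


def _example_calculate_hash(datapath, filename, blocksize=65536):
    """Illustrative stand-in for the package's calculate_hash helper."""
    md5 = hashlib.md5()
    with open(os.path.join(datapath, filename), 'rb') as f:
        for block in iter(lambda: f.read(blocksize), b''):
            md5.update(block)
    return filename, md5.hexdigest()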
def compare(old_datapath, new_datapath, custom_stagingdir):
    logger = logging.getLogger(__name__)

    logger.info('Starting directory comparison')

    if not custom_stagingdir:
        stagingdir = dacman_utils.DACMAN_STAGING_LOC
    else:
        stagingdir = custom_stagingdir

    old_indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(old_datapath))
    new_indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(new_datapath))

    old_index_file = os.path.join(old_indexdir, 'PATH.idx')
    new_index_file = os.path.join(new_indexdir, 'PATH.idx')
    if not os.path.exists(old_index_file):
        indexer.index(old_datapath, stagingdir)
    if not os.path.exists(new_index_file):
        indexer.index(new_datapath, stagingdir)

    old_data_index_file = os.path.join(old_indexdir, 'DATA.idx')
    old_pathname_map_file = os.path.join(old_indexdir, 'PATHNAME.map')
    old_metafile = os.path.join(old_indexdir, 'METADATA')
    new_metafile = os.path.join(new_indexdir, 'METADATA')

    logger.info('Loading indexes for fast comparison')
    old_path_indexes = dacman_utils.file_to_dict(old_index_file)
    new_path_indexes = dacman_utils.file_to_dict(new_index_file)
    old_data_indexes = dacman_utils.file_to_dict(old_data_index_file)
    name_path_map = dacman_utils.file_to_dict_list(old_pathname_map_file)
    old_metadata = dacman_utils.file_to_dict(old_metafile)
    new_metadata = dacman_utils.file_to_dict(new_metafile)

    _unchanged = []
    _metachange = {}
    _added = []
    _deleted = []
    _modified = {}

    # MD5 hash of a zero-byte file: every empty file shares this digest, so
    # empty files cannot be matched to each other by content alone.
    __MAGIC_HASH__ = 'd41d8cd98f00b204e9800998ecf8427e'

    logger.info('Comparing files in %s and %s', old_datapath, new_datapath)
    for filepath in new_path_indexes:
        datahash = new_path_indexes[filepath]
        if filepath in old_path_indexes:
            '''
            if filepaths are same, but data or metadata changed
            '''
            if datahash == old_path_indexes[filepath]:
                if filepath in old_metadata and filepath in new_metadata:
                    if old_metadata[filepath] == new_metadata[filepath]:
                        _unchanged.append(filepath)
                    else:
                        _metachange[filepath] = filepath
                else:
                    _unchanged.append(filepath)
            else:
                _modified[filepath] = filepath
            old_path_indexes.pop(filepath)
            basename = os.path.basename(filepath)
            if basename in name_path_map:
                if filepath in name_path_map[basename]:
                    name_path_map[basename].remove(filepath)
                    if len(name_path_map[basename]) == 0:
                        name_path_map.pop(basename)
        elif os.path.basename(filepath) in name_path_map:
            '''
            if filenames are same, but filepaths and data changed
            '''
            filename = os.path.basename(filepath)
            old_filepaths = name_path_map[filename]
            for old_filepath in old_filepaths:
                if datahash == old_path_indexes[old_filepath]:
                    _metachange[filepath] = old_filepath
                    old_path_indexes.pop(old_filepath)
                    name_path_map[filename].remove(old_filepath)
                    break
            if filepath not in _metachange:
                old_filepath = old_filepaths[0]
                _modified[filepath] = old_filepath
                old_path_indexes.pop(old_filepath)
                del name_path_map[filename][0]
            if len(name_path_map[filename]) == 0:
                name_path_map.pop(filename)
        elif datahash in old_data_indexes and datahash != __MAGIC_HASH__:
            '''
            if data remains same, but filepath changes
            '''
            old_filepath = old_data_indexes[datahash]
            if old_filepath in old_path_indexes:
                _metachange[filepath] = old_filepath
                old_path_indexes.pop(old_filepath)
                old_data_indexes.pop(datahash)
                basename = os.path.basename(old_filepath)
                if basename in name_path_map:
                    if old_filepath in name_path_map[basename]:
                        name_path_map[basename].remove(old_filepath)
                        if len(name_path_map[basename]) == 0:
                            name_path_map.pop(basename)
            else:
                _added.append(filepath)
        else:
            _added.append(filepath)

    # Anything left in the old index was never matched, so it was deleted.
    for old_filepath in old_path_indexes:
        _deleted.append(old_filepath)

    '''
    Saving change information in cache
    '''
    logger.info('Updating change cache entries')
    change_id = dacman_utils.hash_comparison_id(old_datapath, new_datapath)
    cachedir = os.path.join(stagingdir, 'cache')
    if not os.path.exists(cachedir):
        os.makedirs(cachedir)
    change_file = os.path.join(cachedir, 'ENTRIES')
    change_info = {new_datapath: {old_datapath: change_id}}
    dacman_utils.update_yaml(change_info, change_file)

    logger.info('Saving change measurements')
    change_dir = os.path.join(cachedir, change_id)
    if not os.path.exists(change_dir):
        os.makedirs(change_dir)
    _meta_info = {
        'base': {
            'dataset_id': old_datapath,
            'nfiles': dacman_utils.get_nfiles(old_datapath, stagingdir)
        },
        'revision': {
            'dataset_id': new_datapath,
            'nfiles': dacman_utils.get_nfiles(new_datapath, stagingdir)
        }
    }
    _metafile = os.path.join(change_dir, 'META_INFO')
    _ufile = os.path.join(change_dir, 'UNCHANGED')
    _afile = os.path.join(change_dir, 'ADDED')
    _dfile = os.path.join(change_dir, 'DELETED')
    _mfile = os.path.join(change_dir, 'MODIFIED')
    _mcfile = os.path.join(change_dir, 'METACHANGE')
    dacman_utils.dump_yaml(_meta_info, _metafile)
    dacman_utils.list_to_file(_unchanged, _ufile)
    dacman_utils.list_to_file(_added, _afile)
    dacman_utils.list_to_file(_deleted, _dfile)
    dacman_utils.dict_to_file(_modified, _mfile)
    dacman_utils.dict_to_file(_metachange, _mcfile)

    logger.info('Directory comparison complete')
    return change_id
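# The on-disk cache layout produced by compare(), for one comparison
# (illustrative change id):
#
#   <stagingdir>/cache/
#       ENTRIES           # YAML: new_path -> old_path -> change_id
#       <change_id>/
#           META_INFO     # base/revision dataset ids and file counts
#           UNCHANGED     # one relative path per line
#           ADDED
#           DELETED
#           MODIFIED      # new relative path: old relative path
#           METACHANGE    # new relative path: old relative path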
def scan(datapath, custom_stagingdir=None, nonrecursive=False,
         symlinks=False, details=False, ignorelist=None):
    logger = logging.getLogger(__name__)

    if not os.path.exists(datapath):
        logger.error('Datapath %s does not exist!', datapath)
        sys.exit()
    if not os.path.isdir(datapath):
        logger.error('Indexing currently allowed only for data in a directory.')
        sys.exit()

    if not custom_stagingdir:
        stagingdir = dacman_utils.DACMAN_STAGING_LOC
    else:
        stagingdir = custom_stagingdir

    indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))
    if not os.path.exists(indexdir):
        os.makedirs(indexdir)

    # Avoid a mutable default argument: treat a missing ignorelist as empty.
    ignorelist = ignorelist or []

    follow_symlinks = symlinks
    excluded_dirs = {'.dacman': True}

    # NOTE: collecting per-entry metadata (and a DirectoryTree of the scan)
    # here proved too slow, so only file paths and the optional stat() details
    # below are saved.

    logger.info('Scanning datapath %s', datapath)

    scan_funcs = {False: scantree, True: scan_only_dir}
    scan_fn = scan_funcs[nonrecursive]
    paths_file = os.path.join(indexdir, 'FILEPATHS')
    meta_file = os.path.join(indexdir, 'METADATA')

    if nonrecursive:
        logger.info('Ignoring subdirectory scans: scanning files only in the present directory')

    with open(paths_file, 'w') as f, open(meta_file, 'w') as mf:
        for entry in scan_fn(datapath, excluded_dirs, follow_symlinks):
            relative_path = os.path.relpath(entry.path, datapath)
            ignore_file = any(fnmatch.fnmatch(relative_path, pattern)
                              for pattern in ignorelist)
            '''
            only save the file paths and not dir paths
            '''
            if ignore_file or entry.is_dir(follow_symlinks=symlinks):
                continue
            f.write('{}\n'.format(relative_path))
            if details:
                file_stats = entry.stat()
                owner = pwd.getpwuid(file_stats.st_uid).pw_name
                group = grp.getgrgid(file_stats.st_gid).gr_name
                size = file_stats.st_size
                # File modification time is deliberately omitted: versions are
                # compared by content, not by mtime.
                mf.write('{}:owner={},group={},size={}\n'.format(
                    relative_path, owner, group, size))

    logger.info('Saving path metadata and directory scan information')
    basepath_file = os.path.join(indexdir, 'DATAPATH')
    with open(basepath_file, 'w') as f:
        f.write('{}\n'.format(datapath))

    logger.info('Directory scan complete')
    return indexdir
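# With details=True, each METADATA line has the form (illustrative values):
#
#   results/run.log:owner=alice,group=users,size=2048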
def mpi_index(custom_stagingdir, datapath):
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    status = MPI.Status()

    # Message tags for the master/worker protocol.
    class States:
        READY = 0
        START = 1
        DONE = 2
        EXIT = 3

    if rank == 0:
        ''' master: hand out one file per request and collect the hashes '''
        stagingdir = check_stagingdir(custom_stagingdir, datapath)
        indexdir = os.path.join(stagingdir, 'indexes', get_hash_id(datapath))
        deduce_file = os.path.join(indexdir, 'FILEPATHS')
        if not os.path.exists(deduce_file):
            scanner.scan(datapath, stagingdir)

        file_num = 0
        closed_workers = 0
        num_workers = size - 1
        indexes = []
        filelist = read_filelist(deduce_file)

        logger.info('Indexing %d files', len(filelist))

        while closed_workers < num_workers:
            result = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG,
                               status=status)
            source = status.Get_source()
            tag = status.Get_tag()
            if tag == States.READY:
                if file_num < len(filelist):
                    comm.send(filelist[file_num], dest=source,
                              tag=States.START)
                    file_num += 1
                else:
                    comm.send(None, dest=source, tag=States.EXIT)
            elif tag == States.DONE:
                indexes.append(result)
            elif tag == States.EXIT:
                closed_workers += 1

        save_indexes(indexdir, indexes)
        return indexdir
    else:
        ''' worker: request work until told to exit '''
        while True:
            comm.send(None, dest=0, tag=States.READY)
            filename = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
            tag = status.Get_tag()
            if tag == States.START:
                index = calculate_hash(datapath, filename)
                comm.send(index, dest=0, tag=States.DONE)
            elif tag == States.EXIT:
                comm.send(None, dest=0, tag=States.EXIT)
                break
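# Run sketch: mpi_index() follows the classic MPI master/worker pattern, so it
# needs at least two ranks. Assuming a wrapper script `index_job.py` that calls
# mpi_index (the script name is hypothetical):
#
#   mpirun -n 8 python index_job.py /data/sim/v1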
def changes(old_datapath, new_datapath, force=False, custom_stagingdir=None):
    logger = logging.getLogger(__name__)

    if old_datapath == new_datapath:
        logger.error('Comparison paths are the same')
        sys.exit()

    if not custom_stagingdir:
        stagingdir = dacman_utils.DACMAN_STAGING_LOC
    else:
        stagingdir = custom_stagingdir

    is_subdir_oldpath = False
    is_subdir_newpath = False
    cached_old_path = old_datapath
    cached_new_path = new_datapath

    cachedir = os.path.join(stagingdir, 'cache')
    cache_entries = os.path.join(cachedir, 'ENTRIES')

    logger.info('Checking for changes between %s and %s',
                old_datapath, new_datapath)

    '''
    This is the caching logic where change information is saved and
    subsequently retrieved. If no high-level diff exists for the data,
    then do a comparison:
    - do the comparison for all the indexed data
    - at runtime, decide if the comparison is between any subdirectories
      of the total diff
    '''
    if not os.path.exists(cache_entries) or force:
        logger.info('Cache is empty... starting dataset comparison')
        change_dir = comparator.compare(old_datapath, new_datapath, stagingdir)
    else:
        logger.info('Checking for pre-calculated and cached changes.')
        '''
        if the high-level diff exists, then check if it exists for the
        two data versions provided here
        '''
        with open(cache_entries, 'r') as f:
            cache = yaml.safe_load(f)
        '''
        if changes for the newpath are in cache, then check if they are
        for the compared oldpath
        '''
        if new_datapath in cache:
            '''
            if the diff paths are already compared, then get the
            corresponding directory; else, do the comparisons/diff
            '''
            if old_datapath in cache[new_datapath]:
                logger.info('Changes are present in cache... fetching change information.')
                change_dir = cache[new_datapath][old_datapath]
            else:
                '''
                check if the oldpath is a subdirectory of a cached path change
                '''
                for o in cache[new_datapath]:
                    parent_path = o + os.sep
                    if old_datapath.startswith(parent_path):
                        logger.info('Changes can be derived from the cache.')
                        change_dir = cache[new_datapath][o]
                        cached_old_path = os.path.abspath(parent_path)
                        break
                else:
                    '''
                    if the oldpath is neither in cache nor a subdir of a
                    cache entry, then it's a new comparison
                    '''
                    logger.info('Changes are not cached... initiating dataset comparison.')
                    change_dir = comparator.compare(old_datapath, new_datapath, stagingdir)
        else:
            '''
            if changes for the original newpath are not in cache, check
            if any parent directory changes are calculated and cached
            '''
            d = os.path.dirname(new_datapath)
            while d != '/' and d not in cache:
                d = os.path.dirname(d)
            '''
            if changes for a matching parent are found, then check if
            oldpath changes are cached
            '''
            if d != '/':
                if old_datapath in cache[d]:
                    change_dir = cache[d][old_datapath]
                    cached_new_path = d
                else:
                    for o in cache[d]:
                        parent_path = o + os.sep
                        if old_datapath.startswith(parent_path):
                            logger.info('Subdirectory changes can be derived from cache.')
                            change_dir = cache[d][o]
                            cached_old_path = os.path.abspath(parent_path)
                            cached_new_path = d
                            break
                    else:
                        logger.info('Changes are not pre-calculated... initiating dataset comparison.')
                        change_dir = comparator.compare(old_datapath, new_datapath, stagingdir)
            else:
                '''
                if changes are not present in the cache, then compare
                '''
                logger.info('Changes are not pre-calculated... initiating dataset comparison.')
                change_dir = comparator.compare(old_datapath, new_datapath, stagingdir)

    if cached_old_path != old_datapath:
        is_subdir_oldpath = True
    if cached_new_path != new_datapath:
        is_subdir_newpath = True

    logger.info('Retrieving changes between %s and %s',
                old_datapath, new_datapath)

    change = FilesystemChange(cached_old_path, cached_new_path, stagingdir)
    if is_subdir_newpath:
        indexdir = os.path.join(stagingdir, 'indexes',
                                get_hash_id(cached_new_path))
        change.new_nfiles = get_subdir_nfiles(new_datapath, indexdir)
    if is_subdir_oldpath:
        indexdir = os.path.join(stagingdir, 'indexes',
                                get_hash_id(cached_old_path))
        change.old_nfiles = get_subdir_nfiles(old_datapath, indexdir)

    change_data_dir = os.path.join(cachedir, change_dir)

    if not (is_subdir_oldpath or is_subdir_newpath):
        set_change_from_cache(change, change_data_dir)
    else:
        compare_hash = dacman_utils.hash_comparison_id(old_datapath, new_datapath)
        change_data_subdir = os.path.join(cachedir, compare_hash)
        if os.path.exists(change_data_subdir):
            set_change_from_cache(change, change_data_subdir)
        else:
            save_subdir_changes_to_cache(change, stagingdir,
                                         cached_old_path, cached_new_path,
                                         old_datapath, new_datapath,
                                         is_subdir_oldpath, is_subdir_newpath,
                                         change_data_dir, change_data_subdir)

    logger.info('Updating change cache entries')
    change_id = dacman_utils.hash_comparison_id(old_datapath, new_datapath)
    change_file = os.path.join(cachedir, 'ENTRIES')
    change_info = {new_datapath: {old_datapath: change_id}}
    dacman_utils.update_yaml(change_info, change_file)
    logger.info('Change retrieval completed')

    return change
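# Usage sketch (illustrative paths): changes() returns a FilesystemChange
# whose per-category results can then be inspected or displayed, e.g.:
#
#   change = changes('/data/sim/v1', '/data/sim/v2')
#   print(change.old_nfiles, change.new_nfiles)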
def get_change_pairs(self):
    if not (self.old_path and self.new_path):
        self.logger.error('Old and new datapaths are not specified!')
        sys.exit()

    change_pairs = []
    old_base = self.old_path
    new_base = self.new_path

    self.logger.info('Starting diff calculation')
    if self.old_path_is_file and self.new_path_is_file:
        change_pairs.append((self.old_path, self.new_path))
        return change_pairs
    elif self.old_path_is_file != self.new_path_is_file:
        self.logger.error('Datapaths are of different types')
        sys.exit()

    '''
    check if indexes on the data are present;
    else, check for data types and invoke parallel comparison
    '''
    old_index_path = None
    new_index_path = None
    is_indexed = False
    indexdir = os.path.join(self.stagingdir, 'indexes')
    index_metafile = os.path.join(indexdir, 'INDEXED_PATHS')
    if os.path.exists(index_metafile):
        indexed_paths = dacman_utils.load_yaml(index_metafile)
        paths_indexed = [False, False]
        for path in indexed_paths:
            p = path + os.sep
            if self.old_path.startswith(p) or self.old_path == path:
                old_index_path = os.path.join(
                    indexdir, get_hash_id(os.path.abspath(path)))
                paths_indexed[0] = True
            if self.new_path.startswith(p) or self.new_path == path:
                new_index_path = os.path.join(
                    indexdir, get_hash_id(os.path.abspath(path)))
                paths_indexed[1] = True
            if all(paths_indexed):
                is_indexed = True
                break

    if is_indexed:
        changeManager = ChangeManager(self.old_path, self.new_path,
                                      False, self.stagingdir)
        status, cached_old_path, cached_new_path = changeManager.get_cached_paths()
        change_data = changeManager.get_changes(status, cached_old_path,
                                                cached_new_path)

        old_datapath_file = os.path.join(old_index_path, 'DATAPATH')
        new_datapath_file = os.path.join(new_index_path, 'DATAPATH')
        old_filelist = os.path.join(old_index_path, 'FILEPATHS')
        new_filelist = os.path.join(new_index_path, 'FILEPATHS')
        with open(old_datapath_file) as f:
            old_basepath = f.readline().rstrip('\n')
        with open(new_datapath_file) as f:
            new_basepath = f.readline().rstrip('\n')
        # Strip the trailing newline from each FILEPATHS entry before joining;
        # otherwise the path comparisons below can never match. The second loop
        # checks the new path against the new file list.
        with open(old_filelist) as f:
            for relpath in f:
                filepath = os.path.join(old_basepath, relpath.rstrip('\n'))
                if filepath == self.old_path:
                    self.old_path_is_file = True
                    break
        with open(new_filelist) as f:
            for relpath in f:
                filepath = os.path.join(new_basepath, relpath.rstrip('\n'))
                if filepath == self.new_path:
                    self.new_path_is_file = True
                    break
    else:
        self.logger.warning(
            'Datapaths are not indexed. Trying to locate and index the data...')
        '''
        The code below allows to check for a diff between any two random files
        '''
        changeManager = ChangeManager(old_base, new_base,
                                      False, self.stagingdir)
        status, cached_old_path, cached_new_path = changeManager.get_cached_paths()
        change_data = changeManager.get_changes(status, cached_old_path,
                                                cached_new_path)

    changes = change_data.modified

    self.logger.info('Searching for path indexes')
    '''
    find the old and new base directories which are indexed
    '''
    path_prefix_new = cached_new_path
    path_prefix_old = cached_old_path

    '''
    save the metadata about the high-level diff between the directories
    '''
    if not self.old_path_is_file:
        if self.save_changes:
            self._save_dir_diff(change_data)
            self.logger.info('Change summary saved in: %s', self.outdir)
        change.display(change_data)
        '''
        for each file-level change, a detailed change analysis is required
        '''
        for change_key in changes:
            new_path = os.path.join(path_prefix_new, change_key)
            old_path = os.path.join(path_prefix_old, changes[change_key])
            change_pairs.append((new_path, old_path))
    else:
        rel_new_path = os.path.relpath(self.new_path, path_prefix_new)
        rel_old_path = os.path.relpath(self.old_path, path_prefix_old)
        if rel_new_path in changes and changes[rel_new_path] == rel_old_path:
            change_pairs.append((self.new_path, self.old_path))

    return change_pairs
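# The returned change_pairs drive the per-file analysis stage: for a directory
# diff, each entry pairs a file in the new dataset with its modified
# counterpart in the old one, e.g. (illustrative paths):
#
#   [('/data/sim/v2/results/run.log', '/data/sim/v1/results/run.log'), ...]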