def get_files(conn):
    """Find all the files used by the experiment by reading the trace.

    The trace database is queried through ``conn`` for the root processes
    (one per run), the executed files and the opened files. Every path that
    shows up is wrapped in a `TracedFile` and its read/write accesses are
    recorded against the run they happened in.

    :param conn: open database connection giving access to the
        ``processes``, ``executed_files`` and ``opened_files`` tables.
    :returns: a ``(files, inputs, outputs)`` tuple:

        * ``files``: set of `TracedFile` objects that were not purely
          written and that do not lie under one of ``magic_dirs``;
        * ``inputs``: one list of paths per run, holding the regular files
          that were only read during that run, were never executed, are
          outside ``magic_dirs``/``system_dirs`` and survived the filter
          plugins;
        * ``outputs``: one list of paths per run, holding the regular files
          that were written during that run and are outside
          ``magic_dirs``/``system_dirs``.
    """
    files = {}
    # One set of candidate input/output files per run; the last one is the
    # run currently being read from the trace
    access_files = [set()]

    # Finds run timestamps, so we can sort input/output files by run
    proc_cursor = conn.cursor()
    try:
        executions = proc_cursor.execute(
            '''
            SELECT timestamp
            FROM processes
            WHERE parent ISNULL
            ORDER BY id;
            ''')
        # The first root process starts run 0, so only the following
        # timestamps mark a switch to a new run
        run_timestamps = [r_timestamp for r_timestamp, in executions][1:]
    finally:
        # Release the cursor even if the query fails
        proc_cursor.close()

    # Adds dynamic linkers
    for libdir in (Path('/lib'), Path('/lib64')):
        if libdir.exists():
            for linker in libdir.listdir('*ld-linux*'):
                for filename in find_all_links(linker, True):
                    if filename not in files:
                        f = TracedFile(filename)
                        # Not tied to any particular run
                        f.read(None)
                        files[f.path] = f

    # Loops on executed files, and opened files, at the same time
    executed = set()
    run = 0
    cur = conn.cursor()
    try:
        rows = cur.execute(
            '''
            SELECT 'exec' AS event_type, name, NULL AS mode, timestamp
            FROM executed_files
            UNION ALL
            SELECT 'open' AS event_type, name, mode, timestamp
            FROM opened_files
            ORDER BY timestamp;
            ''')
        for event_type, r_name, r_mode, r_timestamp in rows:
            # Executions carry no mode in the query; count them as reads
            if event_type == 'exec':
                r_mode = FILE_READ
            r_name = Path(normalize_path(r_name))

            # Stays on the current run
            while run_timestamps and r_timestamp > run_timestamps[0]:
                del run_timestamps[0]
                access_files.append(set())
                run += 1

            # Adds symbolic links as read files
            for filename in find_all_links(
                    r_name.parent if r_mode & FILE_LINK else r_name,
                    False):
                if filename not in files:
                    f = TracedFile(filename)
                    f.read(run)
                    files[f.path] = f

            # Go to final target
            if not r_mode & FILE_LINK:
                r_name = r_name.resolve()
            # Recorded after resolution so that the membership tests below,
            # which use resolved paths, match
            if event_type == 'exec':
                executed.add(r_name)
            if r_name not in files:
                f = TracedFile(r_name)
                files[f.path] = f
            else:
                f = files[r_name]
            if r_mode & FILE_READ:
                f.read(run)
            if r_mode & FILE_WRITE:
                f.write(run)
                # Mark the parent directory as read
                if r_name.parent not in files:
                    fp = TracedFile(r_name.parent)
                    fp.read(run)
                    files[fp.path] = fp

            # Identifies input files
            if r_name.is_file() and r_name not in executed:
                access_files[-1].add(f)
    finally:
        # Release the cursor even if processing a row fails
        cur.close()

    # Built once instead of once per candidate file
    excluded_dirs = magic_dirs + system_dirs

    # Further filters input files
    inputs = [
        [
            fi.path
            for fi in lst
            # Input files are regular files,
            if fi.path.is_file() and
            # ONLY_READ,
            fi.runs[r] == TracedFile.ONLY_READ and
            # not executable,
            # FIXME : currently disabled; only remove executed files
            # not fi.path.stat().st_mode & 0b111 and
            fi.path not in executed and
            # not in a system directory
            not any(fi.path.lies_under(m) for m in excluded_dirs)
        ]
        for r, lst in enumerate(access_files)
    ]

    # Identify output files
    outputs = [
        [
            fi.path
            for fi in lst
            # Output files are regular files,
            if fi.path.is_file() and
            # WRITTEN
            fi.runs[r] == TracedFile.WRITTEN and
            # not in a system directory
            not any(fi.path.lies_under(m) for m in excluded_dirs)
        ]
        for r, lst in enumerate(access_files)
    ]

    # Run the list of files through the filter plugins
    run_filter_plugins(files, inputs)

    # Files removed from plugins should be removed from inputs as well
    inputs = [[path for path in lst if path in files]
              for lst in inputs]

    # Displays a warning for READ_THEN_WRITTEN files
    read_then_written_files = [
        fi
        for fi in files.values()
        if fi.what == TracedFile.READ_THEN_WRITTEN and
        not any(fi.path.lies_under(m) for m in magic_dirs)
    ]
    if read_then_written_files:
        logger.warning(
            "Some files were read and then written. We will only pack the "
            "final version of the file; reproducible experiments shouldn't "
            "change their input files")
        logger.info("Paths:\n%s",
                    ", ".join(str(fi.path) for fi in read_then_written_files))

    # Files that were only written, or that live under a magic directory,
    # are left out of the returned set
    files = {
        fi
        for fi in files.values()
        if fi.what != TracedFile.WRITTEN and
        not any(fi.path.lies_under(m) for m in magic_dirs)
    }
    return files, inputs, outputs
def get_files(conn):
    """Find all the files used by the experiment by reading the trace.

    The trace database is queried through ``conn`` for the root processes
    (one per run), the executed files and the opened files. Every path that
    shows up is wrapped in a `TracedFile` and its read/write accesses are
    recorded against the run they happened in.

    :param conn: open database connection giving access to the
        ``processes``, ``executed_files`` and ``opened_files`` tables.
    :returns: a ``(files, inputs, outputs)`` tuple:

        * ``files``: set of `TracedFile` objects that were not purely
          written and that do not lie under one of ``magic_dirs``;
        * ``inputs``: one list of paths per run, holding the regular files
          that were only read during that run, were never executed and are
          outside ``magic_dirs``/``system_dirs``;
        * ``outputs``: one list of paths per run, holding the regular files
          that were written during that run and are outside
          ``magic_dirs``/``system_dirs``.
    """
    files = {}
    # One set of candidate input/output files per run; the last one is the
    # run currently being read from the trace
    access_files = [set()]

    # Finds run timestamps, so we can sort input/output files by run
    proc_cursor = conn.cursor()
    try:
        executions = proc_cursor.execute(
            '''
            SELECT timestamp
            FROM processes
            WHERE parent ISNULL
            ORDER BY id;
            ''')
        # The first root process starts run 0, so only the following
        # timestamps mark a switch to a new run
        run_timestamps = [r_timestamp for r_timestamp, in executions][1:]
    finally:
        # Release the cursor even if the query fails
        proc_cursor.close()

    # Adds dynamic linkers
    for libdir in (Path('/lib'), Path('/lib64')):
        if libdir.exists():
            for linker in libdir.listdir('*ld-linux*'):
                for filename in find_all_links(linker, True):
                    if filename not in files:
                        f = TracedFile(filename)
                        # Not tied to any particular run
                        f.read(None)
                        files[f.path] = f

    # Loops on executed files, and opened files, at the same time
    executed = set()
    run = 0
    cur = conn.cursor()
    try:
        rows = cur.execute(
            '''
            SELECT 'exec' AS event_type, name, NULL AS mode, timestamp
            FROM executed_files
            UNION ALL
            SELECT 'open' AS event_type, name, mode, timestamp
            FROM opened_files
            ORDER BY timestamp;
            ''')
        for event_type, r_name, r_mode, r_timestamp in rows:
            # Executions carry no mode in the query; count them as reads
            if event_type == 'exec':
                r_mode = FILE_READ
            r_name = Path(normalize_path(r_name))

            # Stays on the current run
            while run_timestamps and r_timestamp > run_timestamps[0]:
                del run_timestamps[0]
                access_files.append(set())
                run += 1

            # Adds symbolic links as read files
            for filename in find_all_links(
                    r_name.parent if r_mode & FILE_LINK else r_name,
                    False):
                if filename not in files:
                    f = TracedFile(filename)
                    f.read(run)
                    files[f.path] = f

            # Go to final target
            if not r_mode & FILE_LINK:
                r_name = r_name.resolve()
            # Record the executable under its resolved path: every later
            # membership test against `executed` uses resolved paths, so an
            # unresolved symlink entry would never match
            if event_type == 'exec':
                executed.add(r_name)
            if r_name not in files:
                f = TracedFile(r_name)
                files[f.path] = f
            else:
                f = files[r_name]
            # A file opened with the write flag is recorded as written only,
            # even if the read flag is set as well
            if r_mode & FILE_WRITE:
                f.write(run)
                # Mark the parent directory as read
                if r_name.parent not in files:
                    fp = TracedFile(r_name.parent)
                    fp.read(run)
                    files[fp.path] = fp
            elif r_mode & FILE_READ:
                f.read(run)

            # Identifies input files
            if r_name.is_file() and r_name not in executed:
                access_files[-1].add(f)
    finally:
        # Release the cursor even if processing a row fails
        cur.close()

    # Built once instead of once per candidate file
    excluded_dirs = magic_dirs + system_dirs

    # Further filters input files
    inputs = [[fi.path
               for fi in lst
               # Input files are regular files,
               if fi.path.is_file() and
               # ONLY_READ,
               fi.runs[r] == TracedFile.ONLY_READ and
               # not executable,
               # FIXME : currently disabled; only remove executed files
               # not fi.path.stat().st_mode & 0b111 and
               fi.path not in executed and
               # not in a system directory
               not any(fi.path.lies_under(m) for m in excluded_dirs)]
              for r, lst in enumerate(access_files)]

    # Identify output files
    outputs = [[fi.path
                for fi in lst
                # Output files are regular files,
                if fi.path.is_file() and
                # WRITTEN
                fi.runs[r] == TracedFile.WRITTEN and
                # not in a system directory
                not any(fi.path.lies_under(m) for m in excluded_dirs)]
               for r, lst in enumerate(access_files)]

    # Displays a warning for READ_THEN_WRITTEN files
    read_then_written_files = [
        fi
        for fi in itervalues(files)
        if fi.what == TracedFile.READ_THEN_WRITTEN and
        not any(fi.path.lies_under(m) for m in magic_dirs)]
    if read_then_written_files:
        logging.warning(
            "Some files were read and then written. We will only pack the "
            "final version of the file; reproducible experiments shouldn't "
            "change their input files:\n%s",
            ", ".join(unicode_(fi.path) for fi in read_then_written_files))

    # Files that were only written, or that live under a magic directory,
    # are left out of the returned set
    files = set(
        fi
        for fi in itervalues(files)
        if fi.what != TracedFile.WRITTEN and
        not any(fi.path.lies_under(m) for m in magic_dirs))
    return files, inputs, outputs