def dot(self, fp, level_processes):
    """Emit this run into the DOT output.

    At LVL_PROC_RUN the whole run collapses to a single node labelled
    with the first process' binary; at finer levels a cluster subgraph
    is opened and each process is written inside it (thread processes
    only when level_processes is LVL_PROC_THREAD).
    """
    assert self.processes
    if level_processes == LVL_PROC_RUN:
        # One summary node for the entire run
        first_binary = self.processes[0].binary or "-"
        fp.write(' run%d [label="%d: %s"];\n'
                 % (self.nb, self.nb, first_binary))
        return
    # Cluster header, then one entry per (non-thread) process
    fp.write(' subgraph cluster_run%d {\n label="%s";\n'
             % (self.nb, escape(self.name)))
    show_threads = level_processes == LVL_PROC_THREAD
    for proc in self.processes:
        if show_threads or not proc.thread:
            proc.dot(fp, level_processes, indent=2)
    fp.write(' }\n')
def dot(self, fp, level_processes):
    """Render this run in DOT format.

    Writes either a single summary node (LVL_PROC_RUN) or a cluster
    subgraph enumerating the run's processes; thread entries are
    skipped unless level_processes is LVL_PROC_THREAD.
    """
    assert self.processes
    if level_processes == LVL_PROC_RUN:
        fp.write(' run%d [label="%d: %s"];\n' % (
            self.nb,
            self.nb,
            self.processes[0].binary or "-"))
    else:
        fp.write(' subgraph cluster_run%d {\n label="%s";\n' % (
            self.nb, escape(self.name)))
        # Filter out thread entries unless threads were requested
        wanted = (p for p in self.processes
                  if level_processes == LVL_PROC_THREAD or not p.thread)
        for p in wanted:
            p.dot(fp, level_processes, indent=2)
        fp.write(' }\n')
def dot(self, fp, level_processes, indent=1):
    """Write this process as a DOT node, plus the edge from its parent.

    The edge label records how the process came to be (thread, fork,
    exec or fork+exec); thread nodes get a darker fill color.
    """
    pad = ' ' * indent
    extra = ',fillcolor="#666666"' if self.thread else ''
    fp.write(pad + 'prog%d [label="%s (%d)"%s];\n'
             % (self.id, escape(unicode_(self.binary) or "-"),
                self.pid, extra))
    if self.parent is None:
        return
    # Translate the creation mode into a human-readable edge label
    if self.created == C_FORK:
        reason = "thread" if self.thread else "fork"
    elif self.created == C_EXEC:
        reason = "exec"
    elif self.created == C_FORKEXEC:
        reason = "fork+exec"
    else:
        reason = ''
    fp.write(pad + 'prog%d -> prog%d [label="%s"];\n'
             % (self.parent.id, self.id, reason))
def dot(self, fp, level_processes, indent=1):
    """Emit the DOT node for this program and, when it has a parent,
    the creation edge labelled thread/fork/exec/fork+exec."""
    prefix = ' ' * indent
    # Threads are drawn with a darker fill to stand out
    style_suffix = ',fillcolor="#666666"' if self.thread else ''
    label = escape(unicode_(self.binary) or "-")
    fp.write(prefix + 'prog%d [label="%s (%d)"%s];\n'
             % (self.id, label, self.pid, style_suffix))
    if self.parent is not None:
        if self.created == C_FORK:
            # fork() of a thread shows up as "thread"
            reason = "thread" if self.thread else "fork"
        else:
            reason = {C_EXEC: "exec",
                      C_FORKEXEC: "fork+exec"}.get(self.created, '')
        fp.write(prefix + 'prog%d -> prog%d [label="%s"];\n'
                 % (self.parent.id, self.id, reason))
def dot(self, fp, level_pkgs):
    """Write this package to the DOT graph.

    At LVL_PKG_PACKAGE the package becomes one box node; at
    LVL_PKG_FILE it becomes a cluster listing each of its files.
    Packages with no files are omitted entirely.
    """
    assert self.id is not None
    if not self.files:
        return
    pkg_name = escape(self.name)
    if level_pkgs == LVL_PKG_PACKAGE:
        fp.write(' "pkg %s" [shape=box,label=' % pkg_name)
        if self.version:
            fp.write('"%s %s"];\n' % (pkg_name, escape(self.version)))
        else:
            fp.write('"%s"];\n' % pkg_name)
    elif level_pkgs == LVL_PKG_FILE:
        fp.write(' subgraph cluster_pkg%d {\n label=' % self.id)
        if self.version:
            fp.write('"%s %s";\n' % (pkg_name, escape(self.version)))
        else:
            fp.write('"%s";\n' % pkg_name)
        # Files sorted by their unicode path for stable output
        for path in sorted(unicode_(fi) for fi in self.files):
            fp.write(' "%s";\n' % escape(path))
        fp.write(' }\n')
def dot(self, fp, level_pkgs):
    """Render this package as a DOT box (LVL_PKG_PACKAGE) or a
    cluster of its files (LVL_PKG_FILE); empty packages are skipped."""
    assert self.id is not None
    if not self.files:
        return
    # The label body ("name" or "name version") is shared between
    # both rendering modes
    if self.version:
        title = '%s %s' % (escape(self.name), escape(self.version))
    else:
        title = escape(self.name)
    if level_pkgs == LVL_PKG_PACKAGE:
        fp.write(' "pkg %s" [shape=box,label=' % escape(self.name))
        fp.write('"%s"];\n' % title)
    elif level_pkgs == LVL_PKG_FILE:
        fp.write(' subgraph cluster_pkg%d {\n label=' % self.id)
        fp.write('"%s";\n' % title)
        for f in sorted(unicode_(f) for f in self.files):
            fp.write(' "%s";\n' % escape(f))
        fp.write(' }\n')
def generate(target, configfile, database, all_forks=False):
    """Main function for the graph subcommand.

    Reads the trace database and the configuration, then writes a
    GraphViz DOT file connecting programs to the files they read,
    wrote or executed.

    :param target: path of the DOT file to write
    :param configfile: path of the config.yml listing package ownership
    :param database: path of the SQLite trace database
    :param all_forks: if True, keep intermediate fork-only programs
        instead of merging them with the exec that follows
    """
    # In here, a file is any file on the filesystem. A binary is a file, that
    # gets executed. A process is a system-level task, identified by its pid
    # (pids don't get reused in the database).
    # What I call program is the couple (process, binary), so forking creates
    # a new program (with the same binary) and exec'ing creates a new program
    # as well (with the same process)
    # Because of this, fork+exec will create an intermediate program that
    # doesn't do anything (new process but still old binary). If that program
    # doesn't do anything worth showing on the graph, it will be erased,
    # unless all_forks is True (--all-forks).

    # Reads package ownership from the configuration
    if not configfile.is_file():
        logging.critical("Configuration file does not exist!\n"
                         "Did you forget to run 'reprozip trace'?\n"
                         "If not, you might want to use --dir to specify an "
                         "alternate location.")
        sys.exit(1)
    runs, packages, other_files = load_config(configfile, canonical=False)
    # Maps each packaged file path to its package for quick lookup
    packages = dict((f.path, pkg) for pkg in packages for f in pkg.files)

    if PY3:
        # On PY3, connect() only accepts unicode
        conn = sqlite3.connect(str(database))
    else:
        conn = sqlite3.connect(database.path)
    conn.row_factory = sqlite3.Row

    # This is a bit weird. We need to iterate on all types of events at the
    # same time, ordering by timestamp, so we decorate-sort-undecorate
    # Decoration adds timestamp (for sorting) and tags by event type, one of
    # 'process', 'open' or 'exec'

    # Reads processes from the database
    process_cursor = conn.cursor()
    process_rows = process_cursor.execute(
        ''' SELECT id, parent, timestamp FROM processes ORDER BY id ''')
    processes = {}
    all_programs = []

    # ... and opened files...
    file_cursor = conn.cursor()
    file_rows = file_cursor.execute(
        ''' SELECT name, timestamp, mode, process FROM opened_files '''
        '''ORDER BY id ''')
    binaries = set()
    files = OrderedSet()
    edges = OrderedSet()

    # ... as well as executed files.
    exec_cursor = conn.cursor()
    exec_rows = exec_cursor.execute(
        ''' SELECT name, timestamp, process, argv FROM executed_files '''
        '''ORDER BY id ''')

    # Loop on all event lists
    logging.info("Getting all events from database...")
    rows = heapq.merge(((r[2], 'process', r) for r in process_rows),
                       ((r[1], 'open', r) for r in file_rows),
                       ((r[1], 'exec', r) for r in exec_rows))
    for ts, event_type, data in rows:
        if event_type == 'process':
            r_id, r_parent, r_timestamp = data
            if r_parent is not None:
                parent = processes[r_parent]
                binary = parent.binary
            else:
                parent = None
                binary = None
            p = Process(r_id, parent, r_timestamp, False, binary,
                        C_INITIAL if r_parent is None else C_FORK)
            processes[r_id] = p
            all_programs.append(p)
        elif event_type == 'open':
            r_name, r_timestamp, r_mode, r_process = data
            r_name = PosixPath(r_name)
            # Directory changes (working-directory mode) are not drawn
            if r_mode != FILE_WDIR:
                process = processes[r_process]
                files.add(r_name)
                edges.add((process, r_name, r_mode, None))
        elif event_type == 'exec':
            r_name, r_timestamp, r_process, r_argv = data
            r_name = PosixPath(r_name)
            process = processes[r_process]
            binaries.add(r_name)
            # Here we split this process in two "programs", unless the
            # previous one hasn't done anything since it was created via
            # fork()
            if not all_forks and not process.acted:
                process.binary = r_name
                process.created = C_FORKEXEC
                process.acted = True
            else:
                process = Process(process.pid, process, r_timestamp,
                                  True,  # Hides exec only once
                                  r_name, C_EXEC)
                all_programs.append(process)
                processes[r_process] = process
            argv = tuple(r_argv.split('\0'))
            if not argv[-1]:
                argv = argv[:-1]
            edges.add((process, r_name, None, argv))
    process_cursor.close()
    file_cursor.close()
    # FIX: exec_cursor was previously left open; close it like the other
    # cursors before closing the connection
    exec_cursor.close()
    conn.close()

    # Puts files in packages
    logging.info("Organizes packages...")
    package_files = {}
    other_files = []
    for f in files:
        pkg = packages.get(f)
        if pkg is not None:
            package_files.setdefault((pkg.name, pkg.version), []).append(f)
        else:
            other_files.append(f)

    # Writes DOT file
    with target.open('w', encoding='utf-8', newline='\n') as fp:
        fp.write('digraph G {\n /* programs */\n node [shape=box];\n')

        # Programs
        logging.info("Writing programs...")
        for program in all_programs:
            fp.write(' prog%d [label="%s (%d)"];\n' % (
                     id(program), program.binary or "-", program.pid))
            if program.parent is not None:
                reason = ''
                if program.created == C_FORK:
                    reason = "fork"
                elif program.created == C_EXEC:
                    reason = "exec"
                elif program.created == C_FORKEXEC:
                    reason = "fork+exec"
                fp.write(' prog%d -> prog%d [label="%s"];\n' % (
                         id(program.parent), id(program), reason))

        fp.write('\n node [shape=ellipse];\n\n /* system packages */\n')

        # Files from packages
        logging.info("Writing packages...")
        # FIX: loop variable renamed from 'files' to avoid shadowing the
        # OrderedSet of all opened files built above
        for i, ((name, version), pkg_files) in \
                enumerate(iteritems(package_files)):
            fp.write(' subgraph cluster%d {\n label=' % i)
            if version:
                fp.write('"%s %s";\n' % (escape(name), escape(version)))
            else:
                fp.write('"%s";\n' % escape(name))
            for f in pkg_files:
                fp.write(' "%s";\n' % escape(unicode_(f)))
            fp.write(' }\n')

        fp.write('\n /* other files */\n')

        # Other files
        logging.info("Writing other files...")
        for f in other_files:
            fp.write(' "%s"\n' % escape(unicode_(f)))
        fp.write('\n')

        # Edges: blue=exec, red=write, green=read
        logging.info("Connecting edges...")
        for prog, f, mode, argv in edges:
            if mode is None:
                fp.write(' "%s" -> prog%d [color=blue, label="%s"];\n' % (
                         escape(unicode_(f)), id(prog),
                         escape(' '.join(argv))))
            elif mode & FILE_WRITE:
                fp.write(' prog%d -> "%s" [color=red];\n' % (
                         id(prog), escape(unicode_(f))))
            elif mode & FILE_READ:
                fp.write(' "%s" -> prog%d [color=green];\n' % (
                         escape(unicode_(f)), id(prog)))

        fp.write('}\n')
def graph_dot(target, runs, packages, other_files, package_map, edges,
              inputs_outputs, level_pkgs, level_processes,
              level_other_files):
    """Writes a GraphViz DOT file from the collected information.

    Programs are drawn as black boxes and files as ellipses; edges are
    bold for execs, dark blue for writes, light blue for reads.
    (level_other_files is part of the interface but not read here.)
    """
    with target.open('w', encoding='utf-8', newline='\n') as fp:
        fp.write('digraph G {\n /* programs */\n'
                 ' node [shape=box fontcolor=white '
                 'fillcolor=black style=filled];\n')

        # Programs
        logging.info("Writing programs...")
        for run in runs:
            run.dot(fp, level_processes)

        fp.write('\n'
                 ' node [shape=ellipse fontcolor="#131C39" '
                 'fillcolor="#C9D2ED"];\n')

        # Packages (unless they are ignored or dropped outright)
        if level_pkgs not in (LVL_PKG_IGNORE, LVL_PKG_DROP):
            logging.info("Writing packages...")
            fp.write('\n /* system packages */\n')
            for package in sorted(packages, key=lambda p: p.name):
                package.dot(fp, level_pkgs)

        fp.write('\n /* other files */\n')

        # Other files
        logging.info("Writing other files...")
        for fi in sorted(other_files):
            path = escape(unicode_(fi))
            if fi in inputs_outputs:
                # Known input/output: highlight it and show its
                # configured name above the path
                fp.write(' "%(path)s" [fillcolor="#A3B4E0", '
                         'label="%(name)s\\n%(path)s"];\n' % {
                             'path': path,
                             'name': inputs_outputs[fi]})
            else:
                fp.write(' "%s";\n' % path)
        fp.write('\n')

        # Edges
        logging.info("Connecting edges...")
        seen = set()
        for prog, fi, mode, argv in edges:
            prog_node = prog.dot_endpoint(level_processes)
            if fi in package_map:
                if level_pkgs == LVL_PKG_DROP:
                    # Packaged files are not drawn at all at this level
                    continue
                file_node = package_map[fi].dot_endpoint(fi, level_pkgs)
                # Deduplicate edges that collapse onto the same endpoints
                key = prog_node, file_node, mode
                if key in seen:
                    continue
                seen.add(key)
            else:
                file_node = '"%s"' % escape(unicode_(fi))
            if mode is None:
                # Exec edge, labelled with the command line
                fp.write(' %s -> %s [style=bold, label="%s"];\n'
                         % (file_node, prog_node,
                            escape(format_argv(argv))))
            elif mode & FILE_WRITE:
                fp.write(' %s -> %s [color="#000088"];\n'
                         % (prog_node, file_node))
            elif mode & FILE_READ:
                fp.write(' %s -> %s [color="#8888CC"];\n'
                         % (file_node, prog_node))

        fp.write('}\n')
def dot_endpoint(self, f, level_pkgs):
    """Return the quoted DOT node name that edges touching file *f*
    should attach to: the package node at LVL_PKG_PACKAGE, the file's
    own node otherwise."""
    if level_pkgs == LVL_PKG_PACKAGE:
        return '"pkg %s"' % escape(self.name)
    return '"%s"' % escape(unicode_(f))
def write_cltools_module(run, dot_vistrails):
    """Generates a CLTools module description for one experiment run.

    The JSON definition wraps ``python -m reprounzip.plugins.vistrails``,
    exposing ports for the unpacker, target directory, run number and
    command line, plus one file port per input and output file.

    :param run: run dictionary with 'input_files' and 'output_files'
    :param dot_vistrails: path of the .vistrails directory to write into
    :return: the generated module name
    """
    input_files = run['input_files']
    output_files = run['output_files']

    # Module name derived from a hash of the run, so it is stable
    module_name = 'reprounzip_%s' % hash_experiment_run(run)[:7]

    # Writes CLTools JSON definition
    # NOTE(review): assumes the CLTools directory does not already
    # exist here -- mkdir(parents=True) semantics on a pre-existing
    # directory depend on the path library; verify with callers
    (dot_vistrails / 'CLTools').mkdir(parents=True)
    cltools_module = (dot_vistrails / 'CLTools' / module_name) + '.clt'
    logging.info("Writing CLTools definition %s...", cltools_module)
    with cltools_module.open('w', encoding='utf-8', newline='\n') as fp:
        fp.write('{\n'
                 ' "_comment": "This file was generated by reprounzip '
                 '%(version)s at %(date)s",\n\n' % {
                     'version': version,
                     'date': datetime.now().isoformat()})
        # python -m reprounzip.plugins.vistrails
        fp.write(' "command": "%s",\n'
                 ' "args": [\n'
                 ' [\n'
                 ' "constant",\n'
                 ' "-m",\n'
                 ' "flag",\n'
                 ' {}\n'
                 ' ],\n'
                 ' [\n'
                 ' "constant",\n'
                 ' "reprounzip.plugins.vistrails",\n'
                 ' "flag",\n'
                 ' {}\n'
                 ' ],\n' % escape(sys.executable))
        # Unpacker
        fp.write(' [\n'
                 ' "input",\n'
                 ' "unpacker",\n'
                 ' "string",\n'
                 ' {}\n'
                 ' ],\n')
        # Target directory
        fp.write(' [\n'
                 ' "input",\n'
                 ' "directory",\n'
                 ' "string",\n'
                 ' {}\n'
                 ' ],\n')
        # Run number
        fp.write(' [\n'
                 ' "input",\n'
                 ' "run",\n'
                 ' "string",\n'
                 ' {}\n'
                 ' ],\n')
        # Input files
        # FIX: enumerate() was used but the index was never read
        for input_name in input_files:
            fp.write(' [\n'
                     ' "input",\n'
                     ' "input %(name)s",\n'
                     ' "file",\n'
                     ' {\n'
                     ' "flag": "--input-file",\n'
                     ' "prefix": "%(name)s:"\n'
                     ' }\n'
                     ' ],\n' % {'name': escape(input_name)})
        # Output files
        # FIX: same -- enumerate index was unused
        for output_name in output_files:
            fp.write(' [\n'
                     ' "output",\n'
                     ' "output %(name)s",\n'
                     ' "file",\n'
                     ' {\n'
                     ' "flag": "--output-file",\n'
                     ' "prefix": "%(name)s:"\n'
                     ' }\n'
                     ' ],\n' % {'name': escape(output_name)})
        # Command-line
        fp.write(' [\n'
                 ' "input",\n'
                 ' "cmdline",\n'
                 ' "string",\n'
                 ' {\n'
                 ' "flag": "--cmdline"\n'
                 ' }\n'
                 ' ]\n'
                 ' ],\n')
        # Use "std file processing" since VisTrails <=2.1.4 has a bug
        # without this (also, it's inefficient)
        fp.write(' "options": {\n'
                 ' "std_using_files": ""\n'
                 ' },\n')
        # Makes the module check for errors
        fp.write(' "return_code": 0,\n')
        # Enable 'stdout' port
        fp.write(' "stdout": [\n'
                 ' "stdout",\n'
                 ' "file",\n'
                 ' {}\n'
                 ' ]\n'
                 '}\n')

    return module_name
def generate(target, directory, all_forks=False):
    """Main function for the graph subcommand.

    Locates the trace database and configuration inside *directory*,
    then writes a GraphViz DOT file connecting programs to the files
    they read, wrote or executed.

    :param target: path of the DOT file to write
    :param directory: trace directory holding trace.sqlite3 and
        config.yml
    :param all_forks: if True, keep intermediate fork-only programs
        instead of merging them with the exec that follows
    """
    # In here, a file is any file on the filesystem. A binary is a file, that
    # gets executed. A process is a system-level task, identified by its pid
    # (pids don't get reused in the database).
    # What I call program is the couple (process, binary), so forking creates
    # a new program (with the same binary) and exec'ing creates a new program
    # as well (with the same process)
    # Because of this, fork+exec will create an intermediate program that
    # doesn't do anything (new process but still old binary). If that program
    # doesn't do anything worth showing on the graph, it will be erased,
    # unless all_forks is True (--all-forks).
    database = directory / 'trace.sqlite3'

    # Reads package ownership from the configuration
    configfile = directory / 'config.yml'
    if not configfile.is_file():
        logging.critical("Configuration file does not exist!\n"
                         "Did you forget to run 'reprozip trace'?\n"
                         "If not, you might want to use --dir to specify an "
                         "alternate location.")
        sys.exit(1)
    runs, packages, other_files, patterns = load_config(configfile,
                                                        canonical=False)
    # Maps each packaged file path to its package for quick lookup
    packages = dict((f.path, pkg) for pkg in packages for f in pkg.files)

    if PY3:
        # On PY3, connect() only accepts unicode
        conn = sqlite3.connect(str(database))
    else:
        conn = sqlite3.connect(database.path)

    # This is a bit weird. We need to iterate on all types of events at the
    # same time, ordering by timestamp, so we decorate-sort-undecorate
    # Decoration adds timestamp (for sorting) and tags by event type, one of
    # 'process', 'open' or 'exec'

    # Reads processes from the database
    process_cursor = conn.cursor()
    process_rows = process_cursor.execute(
        ''' SELECT id, parent, timestamp FROM processes ORDER BY id ''')
    processes = {}
    all_programs = []

    # ... and opened files...
    file_cursor = conn.cursor()
    file_rows = file_cursor.execute(
        ''' SELECT name, timestamp, mode, process FROM opened_files '''
        '''ORDER BY id ''')
    binaries = set()
    files = OrderedSet()
    edges = OrderedSet()

    # ... as well as executed files.
    exec_cursor = conn.cursor()
    exec_rows = exec_cursor.execute(
        ''' SELECT name, timestamp, process, argv FROM executed_files '''
        '''ORDER BY id ''')

    # Loop on all event lists
    logging.info("Getting all events from database...")
    rows = heapq.merge(((r[2], 'process', r) for r in process_rows),
                       ((r[1], 'open', r) for r in file_rows),
                       ((r[1], 'exec', r) for r in exec_rows))
    for ts, event_type, data in rows:
        if event_type == 'process':
            r_id, r_parent, r_timestamp = data
            if r_parent is not None:
                parent = processes[r_parent]
                binary = parent.binary
            else:
                parent = None
                binary = None
            p = Process(r_id, parent, r_timestamp, False, binary,
                        C_INITIAL if r_parent is None else C_FORK)
            processes[r_id] = p
            all_programs.append(p)
        elif event_type == 'open':
            r_name, r_timestamp, r_mode, r_process = data
            r_name = PosixPath(r_name)
            # Directory changes (working-directory mode) are not drawn
            if r_mode != FILE_WDIR:
                process = processes[r_process]
                files.add(r_name)
                edges.add((process, r_name, r_mode, None))
        elif event_type == 'exec':
            r_name, r_timestamp, r_process, r_argv = data
            r_name = PosixPath(r_name)
            process = processes[r_process]
            binaries.add(r_name)
            # Here we split this process in two "programs", unless the
            # previous one hasn't done anything since it was created via
            # fork()
            if not all_forks and not process.acted:
                process.binary = r_name
                process.created = C_FORKEXEC
                process.acted = True
            else:
                process = Process(process.pid, process, r_timestamp,
                                  True,  # Hides exec only once
                                  r_name, C_EXEC)
                all_programs.append(process)
                processes[r_process] = process
            argv = tuple(r_argv.split('\0'))
            if not argv[-1]:
                argv = argv[:-1]
            edges.add((process, r_name, None, argv))
    process_cursor.close()
    file_cursor.close()
    # FIX: exec_cursor was previously left open; close it like the other
    # cursors before closing the connection
    exec_cursor.close()
    conn.close()

    # Puts files in packages
    logging.info("Organizes packages...")
    package_files = {}
    other_files = []
    for f in files:
        pkg = packages.get(f)
        if pkg is not None:
            package_files.setdefault((pkg.name, pkg.version), []).append(f)
        else:
            other_files.append(f)

    # Writes DOT file
    with target.open('w', encoding='utf-8', newline='\n') as fp:
        fp.write('digraph G {\n /* programs */\n node [shape=box];\n')

        # Programs
        logging.info("Writing programs...")
        for program in all_programs:
            fp.write(' prog%d [label="%s (%d)"];\n' % (
                     id(program), program.binary or "-", program.pid))
            if program.parent is not None:
                reason = ''
                if program.created == C_FORK:
                    reason = "fork"
                elif program.created == C_EXEC:
                    reason = "exec"
                elif program.created == C_FORKEXEC:
                    reason = "fork+exec"
                fp.write(' prog%d -> prog%d [label="%s"];\n' % (
                         id(program.parent), id(program), reason))

        fp.write('\n node [shape=ellipse];\n\n /* system packages */\n')

        # Files from packages
        logging.info("Writing packages...")
        # FIX: loop variable renamed from 'files' to avoid shadowing the
        # OrderedSet of all opened files built above
        for i, ((name, version), pkg_files) in \
                enumerate(iteritems(package_files)):
            fp.write(' subgraph cluster%d {\n label=' % i)
            if version:
                fp.write('"%s %s";\n' % (escape(name), escape(version)))
            else:
                fp.write('"%s";\n' % escape(name))
            for f in pkg_files:
                fp.write(' "%s";\n' % escape(unicode_(f)))
            fp.write(' }\n')

        fp.write('\n /* other files */\n')

        # Other files
        logging.info("Writing other files...")
        for f in other_files:
            fp.write(' "%s"\n' % escape(unicode_(f)))
        fp.write('\n')

        # Edges: blue=exec, red=write, green=read
        logging.info("Connecting edges...")
        for prog, f, mode, argv in edges:
            if mode is None:
                fp.write(' "%s" -> prog%d [color=blue, label="%s"];\n' % (
                         escape(unicode_(f)), id(prog),
                         escape(' '.join(argv))))
            elif mode & FILE_WRITE:
                fp.write(' prog%d -> "%s" [color=red];\n' % (
                         id(prog), escape(unicode_(f))))
            elif mode & FILE_READ:
                fp.write(' "%s" -> prog%d [color=green];\n' % (
                         escape(unicode_(f)), id(prog)))

        fp.write('}\n')
def graph_dot(target, runs, packages, other_files, package_map, edges,
              inputs_outputs, level_pkgs, level_processes,
              level_other_files):
    """Writes a GraphViz DOT file from the collected information.

    Output sections: program nodes (per run), package nodes/clusters,
    other file nodes, then the edges connecting them.
    (level_other_files is part of the interface but not read here.)
    """
    with target.open('w', encoding='utf-8', newline='\n') as fp:
        # Header + node style for programs
        fp.write('digraph G {\n /* programs */\n'
                 ' node [shape=box fontcolor=white '
                 'fillcolor=black style=filled];\n')
        logging.info("Writing programs...")
        for current_run in runs:
            current_run.dot(fp, level_processes)

        # Switch node style for file nodes
        fp.write('\n'
                 ' node [shape=ellipse fontcolor="#131C39" '
                 'fillcolor="#C9D2ED"];\n')

        if level_pkgs not in (LVL_PKG_IGNORE, LVL_PKG_DROP):
            logging.info("Writing packages...")
            fp.write('\n /* system packages */\n')
            for pkg in sorted(packages, key=lambda item: item.name):
                pkg.dot(fp, level_pkgs)

        fp.write('\n /* other files */\n')
        logging.info("Writing other files...")
        for entry in sorted(other_files):
            if entry in inputs_outputs:
                # Input/output files get a highlight and their name
                fp.write(' "%(path)s" [fillcolor="#A3B4E0", '
                         'label="%(name)s\\n%(path)s"];\n'
                         % {'path': escape(unicode_(entry)),
                            'name': inputs_outputs[entry]})
            else:
                fp.write(' "%s";\n' % escape(unicode_(entry)))
        fp.write('\n')

        logging.info("Connecting edges...")
        emitted = set()
        for prog, fpath, mode, argv in edges:
            endp_prog = prog.dot_endpoint(level_processes)
            if fpath not in package_map:
                endp_file = '"%s"' % escape(unicode_(fpath))
            elif level_pkgs == LVL_PKG_DROP:
                # Edges to dropped packaged files are skipped entirely
                continue
            else:
                endp_file = package_map[fpath].dot_endpoint(fpath,
                                                            level_pkgs)
                # Several files can map onto one package endpoint;
                # only draw each (prog, endpoint, mode) edge once
                e = endp_prog, endp_file, mode
                if e in emitted:
                    continue
                emitted.add(e)
            if mode is None:
                fp.write(' %s -> %s [style=bold, label="%s"];\n' % (
                         endp_file, endp_prog,
                         escape(format_argv(argv))))
            elif mode & FILE_WRITE:
                fp.write(' %s -> %s [color="#000088"];\n' % (
                         endp_prog, endp_file))
            elif mode & FILE_READ:
                fp.write(' %s -> %s [color="#8888CC"];\n' % (
                         endp_file, endp_prog))

        fp.write('}\n')
def write_cltools_module(run, dot_vistrails):
    """Writes the CLTools JSON wrapper module for an experiment run.

    Builds a module that invokes ``python -m
    reprounzip.plugins.vistrails`` with ports for the unpacker, target
    directory, run number, command line, and each input/output file.

    :param run: run dictionary with 'input_files' and 'output_files'
    :param dot_vistrails: path of the .vistrails directory to write into
    :return: the generated module name
    """
    input_files = run['input_files']
    output_files = run['output_files']

    # Module name derived from a hash of the run, so it is stable
    module_name = 'reprounzip_%s' % hash_experiment_run(run)[:7]

    # Writes CLTools JSON definition
    # NOTE(review): assumes the CLTools directory does not already
    # exist here -- mkdir(parents=True) semantics on a pre-existing
    # directory depend on the path library; verify with callers
    (dot_vistrails / 'CLTools').mkdir(parents=True)
    cltools_module = (dot_vistrails / 'CLTools' / module_name) + '.clt'
    logging.info("Writing CLTools definition %s...", cltools_module)
    with cltools_module.open('w', encoding='utf-8', newline='\n') as fp:
        fp.write('{\n'
                 ' "_comment": "This file was generated by reprounzip '
                 '%(version)s at %(date)s",\n\n' % {
                     'version': version,
                     'date': datetime.now().isoformat()})
        # python -m reprounzip.plugins.vistrails
        fp.write(' "command": "%s",\n'
                 ' "args": [\n'
                 ' [\n'
                 ' "constant",\n'
                 ' "-m",\n'
                 ' "flag",\n'
                 ' {}\n'
                 ' ],\n'
                 ' [\n'
                 ' "constant",\n'
                 ' "reprounzip.plugins.vistrails",\n'
                 ' "flag",\n'
                 ' {}\n'
                 ' ],\n' % escape(sys.executable))
        # Unpacker
        fp.write(' [\n'
                 ' "input",\n'
                 ' "unpacker",\n'
                 ' "string",\n'
                 ' {}\n'
                 ' ],\n')
        # Target directory
        fp.write(' [\n'
                 ' "input",\n'
                 ' "directory",\n'
                 ' "string",\n'
                 ' {}\n'
                 ' ],\n')
        # Run number
        fp.write(' [\n'
                 ' "input",\n'
                 ' "run",\n'
                 ' "string",\n'
                 ' {}\n'
                 ' ],\n')
        # Input files
        # FIX: enumerate() was used but the index was never read
        for input_name in input_files:
            fp.write(' [\n'
                     ' "input",\n'
                     ' "input %(name)s",\n'
                     ' "file",\n'
                     ' {\n'
                     ' "flag": "--input-file",\n'
                     ' "prefix": "%(name)s:"\n'
                     ' }\n'
                     ' ],\n' % {'name': escape(input_name)})
        # Output files
        # FIX: same -- enumerate index was unused
        for output_name in output_files:
            fp.write(' [\n'
                     ' "output",\n'
                     ' "output %(name)s",\n'
                     ' "file",\n'
                     ' {\n'
                     ' "flag": "--output-file",\n'
                     ' "prefix": "%(name)s:"\n'
                     ' }\n'
                     ' ],\n' % {'name': escape(output_name)})
        # Command-line
        fp.write(' [\n'
                 ' "input",\n'
                 ' "cmdline",\n'
                 ' "string",\n'
                 ' {\n'
                 ' "flag": "--cmdline"\n'
                 ' }\n'
                 ' ]\n'
                 ' ],\n')
        # Use "std file processing" since VisTrails <=2.1.4 has a bug
        # without this (also, it's inefficient)
        fp.write(' "options": {\n'
                 ' "std_using_files": ""\n'
                 ' },\n')
        # Makes the module check for errors
        fp.write(' "return_code": 0,\n')
        # Enable 'stdout' port
        fp.write(' "stdout": [\n'
                 ' "stdout",\n'
                 ' "file",\n'
                 ' {}\n'
                 ' ]\n'
                 '}\n')

    return module_name