def log_input(filename, source):
    """Log input to the database.

    Called by patched functions that do some sort of input (reading from a
    file etc) with the filename and some sort of information about the
    source.

    Note: the source parameter is currently not stored in the database.

    Args:
        filename: path of the input, a list of paths, or a file-like object
            exposing the path via a ``name`` attribute.
        source: name of the library performing the input.
    """
    # Some packages, e.g., xarray, accept a list of files as input argument
    if isinstance(filename, list):
        for f in filename:
            log_input(f, source)
        return
    elif not isinstance(filename, six.string_types):
        # File-like objects expose the underlying path via .name; fall
        # back to using the object as-is when the attribute is missing.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit.)
        try:
            filename = filename.name
        except AttributeError:
            pass
    filename = os.path.abspath(filename)
    if option_set('ignored metadata', 'input_hashes'):
        record = filename
    else:
        record = (filename, hash_file(filename))
    if option_set('general', 'debug'):
        print("Input from %s using %s" % (record, source))
    # Update object in DB
    version = get_version(source)
    db = open_or_create_db()
    db.update(append("inputs", record, no_duplicates=True), eids=[RUN_ID])
    db.update(append("libraries", version, no_duplicates=True),
              eids=[RUN_ID])
    db.close()
def log_output(filename, source):
    """Log output to the database.

    Called by patched functions that do some sort of output (writing to a
    file etc) with the filename and some sort of information about the
    source.

    Note: the source parameter is currently not stored in the database.
    """
    # Idiom fix: isinstance instead of `type(filename) is not str`
    if not isinstance(filename, str):
        # File-like objects expose the underlying path via .name; was a
        # bare `except:` which also swallowed KeyboardInterrupt/SystemExit
        try:
            filename = filename.name
        except AttributeError:
            pass
    filename = os.path.abspath(filename)
    db = open_or_create_db()
    if option_set('data', 'file_diff_outputs') and os.path.isfile(filename):
        # Snapshot the current contents so a diff can be computed after
        # the run completes
        tf = tempfile.NamedTemporaryFile(delete=False)
        shutil.copy2(filename, tf.name)
        add_file_diff_to_db(filename, tf.name, db)
    if option_set('general', 'debug'):
        print("Output to %s using %s" % (filename, source))
    # Update object in DB
    # data hash will be hashed at script exit, if enabled
    db.update(append("outputs", filename, no_duplicates=True), eids=[RUN_ID])
    db.update(append("libraries", get_version(source), no_duplicates=True),
              eids=[RUN_ID])
    db.close()
def log_input(filename, source):
    """Log input to the database.

    Called by patched functions that do some sort of input (reading from a
    file etc) with the filename and some sort of information about the
    source.

    Note: the source parameter is currently not stored in the database.
    """
    # Idiom fix: isinstance instead of `type(filename) is not str`
    if not isinstance(filename, str):
        # File-like objects expose the underlying path via .name; was a
        # bare `except:` which also swallowed KeyboardInterrupt/SystemExit
        try:
            filename = filename.name
        except AttributeError:
            pass
    filename = os.path.abspath(filename)
    if option_set('data', 'hash_inputs'):
        record = (filename, hash_file(filename))
    else:
        record = filename
    if option_set('general', 'debug'):
        print("Input from %s using %s" % (record, source))
    # Update object in DB
    db = open_or_create_db()
    db.update(append("inputs", record, no_duplicates=True), eids=[RUN_ID])
    db.update(append("libraries", get_version(source), no_duplicates=True),
              eids=[RUN_ID])
    db.close()
def log_output(filename, source):
    """Log output to the database.

    Called by patched functions that do some sort of output (writing to a
    file etc) with the filename and some sort of information about the
    source.

    Note: the source parameter is currently not stored in the database.
    """
    # Some packages accept a list of files as output argument
    if isinstance(filename, list):
        for f in filename:
            log_output(f, source)
        return
    elif not isinstance(filename, six.string_types):
        # File-like objects expose the underlying path via .name; was a
        # bare `except:` which also swallowed KeyboardInterrupt/SystemExit
        try:
            filename = filename.name
        except AttributeError:
            pass
    filename = os.path.abspath(filename)
    version = get_version(source)
    db = open_or_create_db()
    # Only snapshot text files for diffing; binary diffs are not useful
    if option_set('data', 'file_diff_outputs') and os.path.isfile(filename) \
            and not is_binary(filename):
        tf = tempfile.NamedTemporaryFile(delete=False)
        shutil.copy2(filename, tf.name)
        add_file_diff_to_db(filename, tf.name, db)
    if option_set('general', 'debug'):
        print("Output to %s using %s" % (filename, source))
    # Update object in DB
    # data hash will be hashed at script exit, if enabled
    db.update(append("outputs", filename, no_duplicates=True), eids=[RUN_ID])
    db.update(append("libraries", version, no_duplicates=True),
              eids=[RUN_ID])
    db.close()
def log_init():
    """Do the initial logging for a new run.

    Works out what script has been run, creates a unique run ID, collects
    basic metadata (author, command, environment) plus optional git info,
    and inserts the new run record into the database.  Sets the
    module-level RUN_ID global to the inserted record's id.
    """
    # Get the path of the script we're running
    # When running python -m recipy ..., during the recipy import argument 0
    # is -c (for Python 2) or -m (for Python 3) and the script is argument 1
    if sys.argv[0] in ['-c', '-m']:
        # Has the user called python -m recipy without further arguments?
        if len(sys.argv) < 2:
            return
        scriptpath = os.path.realpath(sys.argv[1])
    else:
        scriptpath = os.path.realpath(sys.argv[0])

    global RUN_ID

    # Open the database
    db = open_or_create_db()

    # Create the unique ID for this run
    guid = str(uuid.uuid4())

    # Get general metadata, environment info, etc
    run = {"unique_id": guid,
           "author": getpass.getuser(),
           "description": "",
           "inputs": [],
           "outputs": [],
           "script": scriptpath,
           "command": sys.executable,
           "environment": [platform.platform(),
                           "python " + sys.version.split('\n')[0]],
           "date": datetime.datetime.utcnow()}

    if not option_set('ignored metadata', 'git'):
        try:
            repo = Repo(scriptpath, search_parent_directories=True)
            run["gitrepo"] = repo.working_dir
            run["gitcommit"] = repo.head.commit.hexsha
            run["gitorigin"] = get_origin(repo)
            if not option_set('ignored metadata', 'diff'):
                # Concatenate the patch text of every uncommitted change
                whole_diff = ''
                diffs = repo.index.diff(None, create_patch=True)
                for diff in diffs:
                    whole_diff += "\n\n\n" + diff.diff.decode("utf-8")
                run['diff'] = whole_diff
        except (InvalidGitRepositoryError, ValueError):
            # We can't store git info for some reason, so just skip it
            pass

    # Put basics into DB
    RUN_ID = db.insert(run)

    # Print message
    if not option_set('general', 'quiet'):
        print("recipy run inserted, with ID %s" % (guid))

    db.close()
def log_init():
    """Do the initial logging for a new run.

    Works out what script has been run (including ``python -m`` runs),
    creates a unique run ID, inserts the basic run metadata into the
    database, sets the module-level RUN_ID global, and registers an
    excepthook so uncaught exceptions are logged against this run.
    """
    # Get the path of the script we're running
    # When running python -m recipy ..., during the recipy import argument 0
    # is -c (for Python 2) or -m (for Python 3) and the script is argument 1
    if sys.argv[0] in ['-c', '-m']:
        # Has the user called python -m recipy without further arguments?
        if len(sys.argv) < 2:
            return
        scriptpath = os.path.realpath(sys.argv[1])
        cmd_args = sys.argv[2:]
    else:
        scriptpath = os.path.realpath(sys.argv[0])
        cmd_args = sys.argv[1:]

    global RUN_ID

    # Open the database
    db = open_or_create_db()

    # Create the unique ID for this run
    guid = str(uuid.uuid4())

    # Get general metadata, environment info, etc
    run = {"unique_id": guid,
           "author": getpass.getuser(),
           "description": "",
           "inputs": [],
           "outputs": [],
           "script": scriptpath,
           "command": sys.executable,
           "environment": [platform.platform(),
                           "python " + sys.version.split('\n')[0]],
           "date": datetime.datetime.utcnow(),
           "exit_date": None,  # updated at script exit
           "command_args": " ".join(cmd_args)}

    if not option_set('ignored metadata', 'git'):
        add_git_info(run, scriptpath)

    # Put basics into DB
    RUN_ID = db.insert(run)

    # Print message
    if not option_set('general', 'quiet'):
        print("recipy run inserted, with ID %s" % (guid))

    db.close()

    # Register exception hook so exceptions can be logged
    sys.excepthook = log_exception
def load_module(self, name):
    """Module loading method.

    It imports the module normally, and then calls the `patch` method to
    wrap the functions we need. `patch` is implemented by subclasses.

    Raises:
        ImportError: if `name` is not the specific module this loader
            was created for.
    """
    if name != self.modulename:
        # Bug fix: the message was passed as `ImportError("%s ...", cls)`,
        # so the %s was never interpolated (the args were stored as a
        # tuple). Format the message explicitly instead.
        raise ImportError(
            "%s can only be used to import a specific module!" %
            self.__class__.__name__)
    if name in sys.modules:
        return sys.modules[name]  # already imported and patched
    # Find the module
    file_obj, pathname, desc = recursive_find_module(name, sys.path)
    try:
        mod = imp.load_module(name, file_obj, pathname, desc)
    finally:
        # imp.load_module leaves closing the file object to the caller
        if file_obj:
            file_obj.close()
    if option_set('general', 'debug'):
        print("Patching %s" % mod.__name__)
    # Actually do the patching
    mod = self.patch(mod)
    # And put the module in Python's proper namespace
    sys.modules[name] = mod
    return mod
def patch(self, mod):
    """Wrap every function listed in `self.functions` with `self.wrapper`
    and return the patched module.
    """
    for func_name in self.functions:
        if option_set('general', 'debug'):
            print('Patching input/output function: {}'.format(func_name))
        patch_function(mod, func_name, self.wrapper)
    return mod
def load_module(self, name):
    """Module loading method.

    It imports the module normally, and then calls the `patch` method to
    wrap the functions we need. `patch` is implemented by subclasses.

    Raises:
        ImportError: if `name` is not the specific module this loader
            was created for.
    """
    if name != self.modulename:
        # Bug fix: the message was passed as a separate argument to
        # ImportError, so the %s was never interpolated. Format it
        # explicitly instead.
        raise ImportError("%s can only be used to import a specific module!"
                          % self.__class__.__name__)
    if name in sys.modules:
        return sys.modules[name]  # already imported and patched
    # Find the module
    file_obj, pathname, desc = recursive_find_module(name, sys.path)
    try:
        mod = imp.load_module(name, file_obj, pathname, desc)
    finally:
        # imp.load_module leaves closing the file object to the caller
        if file_obj:
            file_obj.close()
    if option_set('general', 'debug'):
        print("Patching %s" % mod.__name__)
    # Actually do the patching
    mod = self.patch(mod)
    # And put the module in Python's proper namespace
    sys.modules[name] = mod
    return mod
def output_file_diffs():
    """Compute and store file diffs for the run's outputs.

    Writing to output files is complete; we can now compute file diffs
    between the snapshot taken at write time and the final file.
    """
    if not option_set('data', 'file_diff_outputs'):
        return
    # Encodings tried, in order, when reading the before/after files
    encodings = ['utf-8', 'latin-1']
    with open_or_create_db() as db:
        diffs_table = db.table('filediffs')
        diffs = diffs_table.search(Query().run_id == RUN_ID)
    for item in diffs:
        if option_set('general', 'debug'):
            print('Storing file diff for "%s"' % item['filename'])
        lines1 = None
        lines2 = None
        for enc in encodings:
            # Bug fix: only attempt a file that has not been read yet and
            # stop once both succeed. Previously the loop kept going, and
            # since latin-1 never raises UnicodeDecodeError it always
            # overwrote a successful utf-8 read.
            if lines1 is None:
                try:
                    with codecs.open(item['tempfilename'], encoding=enc) as f:
                        lines1 = f.readlines()
                except UnicodeDecodeError:
                    pass
            if lines2 is None:
                try:
                    with codecs.open(item['filename'], encoding=enc) as f:
                        lines2 = f.readlines()
                except UnicodeDecodeError:
                    pass
            if lines1 is not None and lines2 is not None:
                break
        if lines1 is not None and lines2 is not None:
            diff = difflib.unified_diff(lines1, lines2,
                                        fromfile='before this run',
                                        tofile='after this run')
            with open_or_create_db() as db:
                diffs_table.update({'diff': ''.join([l for l in diff])},
                                   eids=[item.eid])
        else:
            msg = ('Unable to read file "{}" using supported encodings ({}). '
                   'To be able to store file diffs, use one of the supported '
                   'encodings to write the output file.')
            warnings.warn(msg.format(item['filename'], ', '.join(encodings)))
        # delete temporary file
        os.remove(item['tempfilename'])
def log_output(filename, source):
    """Record an output file for the current run.

    Appends the absolute path of `filename` to the run's list of outputs.
    The `source` (library name) is only used for debug output.
    """
    abs_path = os.path.abspath(filename)
    if option_set('general', 'debug'):
        print("Output to %s using %s" % (abs_path, source))
    # Update object in DB
    db = open_or_create_db()
    db.update(append("outputs", abs_path), eids=[RUN_ID])
    db.close()
def log_exit():
    """Record the script's completion timestamp on the current run.

    We don't save the duration because it's harder to serialize a
    timedelta.
    """
    if option_set('general', 'debug'):
        print("recipy run complete")
    finished_at = datetime.datetime.utcnow()
    db = open_or_create_db()
    db.update({'exit_date': finished_at}, eids=[RUN_ID])
    db.close()
def log_input(filename, source):
    """Log an input file for the current run.

    Optionally stores a git-style hash of the file's contents alongside
    the path. The `source` (library name) is only used for debug output.
    """
    # Idiom fix: isinstance instead of `type(filename) is not str`
    if not isinstance(filename, str):
        # File-like objects expose the underlying path via .name; was a
        # bare `except:` which also swallowed KeyboardInterrupt/SystemExit
        try:
            filename = filename.name
        except AttributeError:
            pass
    filename = os.path.abspath(filename)
    if option_set('data', 'hash_inputs'):
        record = (filename, git_hash_object(filename))
    else:
        record = filename
    if option_set('general', 'debug'):
        print("Input from %s using %s" % (record, source))
    # Update object in DB
    db = open_or_create_db()
    db.update(append("inputs", record, no_duplicates=True), eids=[RUN_ID])
    db.close()
def hash_outputs():
    """Replace each logged output path with a (path, hash) pair.

    Writing to output files is complete, so the hashes reflect the final
    file contents.
    """
    if not option_set('data', 'hash_outputs'):
        return
    db = open_or_create_db()
    run = db.get(eid=RUN_ID)
    hashed = []
    for path in run.get('outputs'):
        hashed.append((path, hash_file(path)))
    db.update({'outputs': hashed}, eids=[RUN_ID])
    db.close()
def hash_outputs():
    """Attach content hashes to the run's outputs, unless disabled.

    Writing to output files is complete, so we can now compute hashes of
    the final file contents.
    """
    if option_set('ignored metadata', 'output_hashes'):
        return
    db = open_or_create_db()
    run = db.get(eid=RUN_ID)
    with_hashes = [(path, hash_file(path)) for path in run.get('outputs')]
    db.update({'outputs': with_hashes}, eids=[RUN_ID])
    db.close()
def log_exception(typ, value, traceback):
    """Store an uncaught exception on the current run, then delegate to
    the default excepthook.
    """
    if option_set('general', 'debug'):
        print("Logging exception %s" % value)
    details = {
        'type': typ.__name__,
        'message': str(value),
        'traceback': ''.join(format_tb(traceback)),
    }
    # Update object in DB
    db = open_or_create_db()
    db.update({"exception": details}, eids=[RUN_ID])
    db.close()
    # Done logging, call default exception handler
    sys.__excepthook__(typ, value, traceback)
def log_output(filename, source):
    """Log an output file for the current run.

    Appends the absolute path of `filename` to the run's outputs,
    skipping duplicates. The `source` (library name) is only used for
    debug output.
    """
    # Idiom fix: isinstance instead of `type(filename) is not str`
    if not isinstance(filename, str):
        # File-like objects expose the underlying path via .name; was a
        # bare `except:` which also swallowed KeyboardInterrupt/SystemExit
        try:
            filename = filename.name
        except AttributeError:
            pass
    filename = os.path.abspath(filename)
    if option_set('general', 'debug'):
        print("Output to %s using %s" % (filename, source))
    # Update object in DB
    db = open_or_create_db()
    db.update(append("outputs", filename, no_duplicates=True), eids=[RUN_ID])
    db.close()
def log_exception(typ, value, traceback):
    """Persist details of an uncaught exception on the current run, then
    re-dispatch to the standard exception handler.
    """
    if option_set('general', 'debug'):
        print("Logging exception %s" % value)
    tb_text = ''.join(format_tb(traceback))
    exception = {'type': typ.__name__,
                 'message': str(value),
                 'traceback': tb_text}
    # Update object in DB
    db = open_or_create_db()
    db.update({"exception": exception}, eids=[RUN_ID])
    db.close()
    # Done logging, call default exception handler
    sys.__excepthook__(typ, value, traceback)
def add_svn_info(run, scriptpath):
    """Add information about the svn repository holding the source file
    to the database.
    """
    try:
        client = svn.local.LocalClient(scriptpath)
        info = client.info()
        run["svnrepo"] = info["repository_root"]
        run["svncommit"] = info["commit_revision"]
        if not option_set('ignored metadata', 'diff'):
            run['diff'] = svn_diff(info["wc-info/wcroot-abspath"])
    except (SvnException, ValueError, OSError):
        # We can't access svn info for some reason, so just skip it
        pass
def patch(self, mod):
    """Do the patching of `input_functions` and `output_functions` in
    `mod` using `input_wrapper` and `output_wrapper` respectively.

    Iterates over the function descriptors in `self.wrappers.functions`
    (dicts with at least 'type', 'function' and 'wrapper' keys) and wraps
    each one unless configuration says to ignore it. Returns the patched
    module.
    """
    for f in self.wrappers.functions:
        if not self._ignore(f):
            if option_set('general', 'debug'):
                msg = 'Patching {} function: {}'.format(
                    f['type'], f['function'])
                print(msg)
            # The function that is returned by create_wrapper assumes that
            # the wrapper is created directly on the patch object (the
            # first argument of f is self). We have to fake that here.
            # Otherwise, there will be an error, because an argument is
            # missing:
            # TypeError f() takes exactly 5 arguments (4 given)
            # NOTE(review): setattr on self.__class__ mutates state shared
            # by all instances of this class -- looks intentional, but
            # confirm no two patchers of the same class run concurrently.
            setattr(self.__class__, 'wrapper', f['wrapper'])
            patch_function(mod, f['function'], self.wrapper)
        else:
            if option_set('general', 'debug'):
                print('Ignoring {} for: {}'.format(f['type'],
                                                   self.modulename))
    return mod
def patch(self, mod):
    """Do the patching of `input_functions` and `output_functions` in
    `mod` using `input_wrapper` and `output_wrapper` respectively.
    """
    if self._ignore_input():
        if option_set('general', 'debug'):
            print('Ignoring inputs for: %s' % self.modulename)
    else:
        for func_name in self.input_functions:
            if option_set('general', 'debug'):
                print('Patching input function: %s' % func_name)
            patch_function(mod, func_name, self.input_wrapper)

    if self._ignore_output():
        if option_set('general', 'debug'):
            print('Ignoring outputs for: %s' % self.modulename)
    else:
        for func_name in self.output_functions:
            if option_set('general', 'debug'):
                print('Patching output function: %s' % func_name)
            patch_function(mod, func_name, self.output_wrapper)

    return mod
def log_warning(msg, typ, script, lineno, **kwargs):
    """Append a warning raised during the run to the run record, then
    echo it to stderr in the standard warning format.
    """
    if option_set('general', 'debug'):
        print('Logging warning "%s"' % str(msg))
    entry = {'type': typ.__name__,
             'message': str(msg),
             'script': script,
             'lineno': lineno}
    # Update object in DB
    db = open_or_create_db()
    db.update(append("warnings", entry, no_duplicates=True), eids=[RUN_ID])
    db.close()
    # Done logging, print warning to stderr
    sys.stderr.write(warnings.formatwarning(msg, typ, script, lineno))
def dedupe_inputs():
    """Remove inputs that are logged multiple times.

    Sometimes patched libraries use other patched libraries to open
    files. E.g., xarray internally uses netCDF4 to open netcdf files. If
    this happens, and recipy is configured to log file hashes, inputs are
    logged multiple times. Hashed inputs are stored as a list in the
    database, and tinydb does not automatically dedupe lists.

    Outputs do not need to be deduped, because file hashes are added
    after the run is finished, and tinydb can automatically dedupe
    strings.
    """
    if option_set('ignored metadata', 'input_hashes'):
        return
    db = open_or_create_db()
    run = db.get(eid=RUN_ID)
    # Lists are unhashable, so convert each entry to a tuple before
    # deduping via a set
    unique = {tuple(entry) for entry in run['inputs']}
    db.update({'inputs': list(unique)}, eids=[RUN_ID])
    db.close()
def output_file_diffs():
    """Compute and store unified diffs for the run's output files.

    Writing to output files is complete; we can now diff each output
    against the snapshot taken when it was first written, then discard
    the snapshot.
    """
    if not option_set('data', 'file_diff_outputs'):
        return
    db = open_or_create_db()
    diffs_table = db.table('filediffs')
    diffs = diffs_table.search(Query().run_id == RUN_ID)
    for item in diffs:
        # Bug fix: the files were opened without being closed (resource
        # leak); use context managers instead.
        with open(item['tempfilename']) as before, \
                open(item['filename']) as after:
            diff = difflib.unified_diff(before.readlines(),
                                        after.readlines(),
                                        fromfile='before this run',
                                        tofile='after this run')
            diff_text = ''.join(diff)
        diffs_table.update({'diff': diff_text}, eids=[item.eid])
        # delete temporary file
        os.remove(item['tempfilename'])
    db.close()
def add_git_info(run, scriptpath):
    """Add information about the git repository holding the source file
    to the database"""
    try:
        repo = Repo(scriptpath, search_parent_directories=True)
        run["gitrepo"] = repo.working_dir
        run["gitcommit"] = repo.head.commit.hexsha
        run["gitorigin"] = get_origin(repo)
        if not option_set('ignored metadata', 'diff'):
            # Collect a patch per dirty file, each prefixed with a
            # ---/+++ header, and join them into one diff string
            parts = []
            for diff in repo.index.diff(None, create_patch=True):
                header = "--- {}\n+++ {}\n".format(diff.a_path, diff.b_path)
                parts.append("\n\n\n" + header + diff.diff.decode("utf-8"))
            run['diff'] = ''.join(parts)
    except (InvalidGitRepositoryError, ValueError):
        # We can't store git info for some reason, so just skip it
        pass
def add_git_info(run, scriptpath):
    """Add information about the git repository holding the source file
    to the run record.

    Args:
        run: dict of run metadata, updated in place.
        scriptpath: path of the script being tracked.
    """
    try:
        repo = Repo(scriptpath, search_parent_directories=True)
        run["githash"] = git_hash_object(scriptpath)
        run["gitrepo"] = repo.working_dir
        run["gitcommit"] = repo.head.commit.hexsha
        try:
            run["gitorigin"] = repo.remotes.origin.url
        except AttributeError:
            # No remote named 'origin'. (Was a bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit.)
            run["gitorigin"] = None
        if not option_set('ignored metadata', 'diff'):
            whole_diff = ''
            diffs = repo.index.diff(None, create_patch=True)
            for diff in diffs:
                whole_diff += "\n\n\n" + diff.diff.decode("utf-8")
            run['diff'] = whole_diff
    except (InvalidGitRepositoryError, ValueError):
        # We can't store git info for some reason, so just skip it
        pass
def log_values(custom_values=None, **kwargs):
    """Log custom key-value pairs into the database.

    e.g,
    >>> log_values(a=1, b=2)
    >>> log_values({'c': 3, 'd': 4})
    >>> log_values({'e': 5, 'f': 6}, g=7, h=8)
    """
    # Merge the dict argument (if any) with the keyword arguments
    if custom_values is None:
        custom_values = {}
    assert isinstance(custom_values, dict), \
        "custom_values must be a dict. type(custom_values) = %s" % type(custom_values)
    custom_values.update(kwargs)
    # debugging
    if option_set('general', 'debug'):
        print('Logging custom values: %s' % str(custom_values))
    # Update object in DB
    db = open_or_create_db()
    db.update(add_dict("custom_values", custom_values), eids=[RUN_ID])
    db.close()
def log_init():
    """Do the initial logging for a new run.

    Works out what script has been run, creates a new unique run ID, and
    gets the basic metadata.

    This is called when running `import recipy`.
    """
    # Get the path of the script we're running
    # When running python -m recipy ..., during the recipy import argument 0
    # is -c (for Python 2) or -m (for Python 3) and the script is argument 1
    if sys.argv[0] in ['-c', '-m']:
        # Has the user called python -m recipy without further arguments?
        if len(sys.argv) < 2:
            return
        scriptpath = os.path.realpath(sys.argv[1])
        cmd_args = sys.argv[2:]
    else:
        scriptpath = os.path.realpath(sys.argv[0])
        cmd_args = sys.argv[1:]

    global RUN_ID

    # Open the database
    db = open_or_create_db()

    # Create the unique ID for this run
    guid = str(uuid.uuid4())

    # Get general metadata, environment info, etc
    run = {
        "unique_id": guid,
        "author": getpass.getuser(),
        "description": "",
        "inputs": [],
        "outputs": [],
        "script": scriptpath,
        "command": sys.executable,
        "environment": [platform.platform(),
                        "python " + sys.version.split('\n')[0]],
        "date": datetime.datetime.utcnow(),
        "command_args": " ".join(cmd_args),
        "warnings": [],
        "libraries": [get_version('recipy')]
    }

    if not option_set('ignored metadata', 'git'):
        try:
            repo = Repo(scriptpath, search_parent_directories=True)
            run["gitrepo"] = repo.working_dir
            run["gitcommit"] = repo.head.commit.hexsha
            run["gitorigin"] = get_origin(repo)
            if not option_set('ignored metadata', 'diff'):
                # Concatenate the patch text of every uncommitted change
                whole_diff = ''
                diffs = repo.index.diff(None, create_patch=True)
                for diff in diffs:
                    whole_diff += "\n\n\n" + diff.diff.decode("utf-8")
                run['diff'] = whole_diff
        except (InvalidGitRepositoryError, ValueError):
            # We can't store git info for some reason, so just skip it
            pass

    # Put basics into DB
    RUN_ID = db.insert(run)

    # Print message
    if not option_set('general', 'quiet'):
        print("recipy run inserted, with ID %s" % (guid))

    # check whether patched modules were imported before recipy was imported
    patches = db.table('patches')
    for p in patches.all():
        if p['modulename'] in sys.modules:
            msg = 'not tracking inputs and outputs for {}; recipy was ' \
                  'imported after this module'.format(p['modulename'])
            warnings.warn(msg, stacklevel=3)

    db.close()

    # Register exception hook so exceptions can be logged
    sys.excepthook = log_exception
def log_init(notebookName=None):
    """Do the initial logging for a new run.

    Works out what script has been run, creates a new unique run ID, and
    gets the basic metadata.

    This is called when running `import recipy`.

    Args:
        notebookName: name of the notebook being tracked, when running in
            notebook mode; None for plain script runs.
    """
    notebookMode = get_notebook_mode()
    if notebookMode and notebookName is None:
        # Avoid first call without Notebook name
        return
    if notebookMode:
        scriptpath = notebookName
        cmd_args = sys.argv[1:]
    # Get the path of the script we're running
    # When running python -m recipy ..., during the recipy import argument 0
    # is -c (for Python 2) or -m (for Python 3) and the script is argument 1
    elif sys.argv[0] in ['-c', '-m']:
        # Has the user called python -m recipy without further arguments?
        if len(sys.argv) < 2:
            return
        scriptpath = os.path.realpath(sys.argv[1])
        cmd_args = sys.argv[2:]
    else:
        scriptpath = os.path.realpath(sys.argv[0])
        cmd_args = sys.argv[1:]

    global RUN_ID

    # Open the database
    db = open_or_create_db()

    # Create the unique ID for this run
    guid = str(uuid.uuid4())

    # Get general metadata, environment info, etc
    run = {
        "unique_id": guid,
        "author": getpass.getuser(),
        "description": "",
        "inputs": [],
        "outputs": [],
        "script": scriptpath,
        "command": sys.executable,
        "environment": [platform.platform(),
                        "python " + sys.version.split('\n')[0]],
        "date": datetime.datetime.utcnow(),
        "command_args": " ".join(cmd_args),
        "warnings": [],
        "libraries": [get_version('recipy')],
        "custom_values": {}
    }

    # Version-control metadata only applies to on-disk scripts, not
    # notebooks
    if not notebookName and not option_set('ignored metadata', 'git'):
        add_git_info(run, scriptpath)

    if not notebookName and not option_set('ignored metadata', 'svn'):
        add_svn_info(run, scriptpath)

    # Put basics into DB
    RUN_ID = db.insert(run)

    # Print message
    if not option_set('general', 'quiet'):
        print("recipy run inserted, with ID %s" % (guid))

    # check whether patched modules were imported before recipy was imported
    patches = db.table('patches')
    for p in patches.all():
        if p['modulename'] in sys.modules:
            msg = 'not tracking inputs and outputs for {}; recipy was ' \
                  'imported after this module'.format(p['modulename'])
            warnings.warn(msg, stacklevel=3)

    db.close()

    # Register exception hook so exceptions can be logged
    sys.excepthook = log_exception
def _ignore_output(self):
    """Return a truthy value when outputs for this module's top-level
    package (or for all modules) are configured to be ignored.
    """
    top_level = self.modulename.split('.')[0]
    result = option_set('ignored outputs', top_level)
    if not result:
        result = option_set('ignored outputs', 'all')
    return result
def _ignore_output(self):
    """Check whether outputs of this module (or of all modules) should
    not be tracked, per configuration.
    """
    package = self.modulename.split('.')[0]
    return (option_set('ignored outputs', package) or
            option_set('ignored outputs', 'all'))
def _ignore(self, f):
    """Return a truthy value when functions of f['type'] ('input' or
    'output') should not be tracked for this module's top-level package,
    or for all modules.
    """
    section = 'ignored {}s'.format(f['type'])
    package = self.modulename.split('.')[0]
    return option_set(section, package) or option_set(section, 'all')