def open_job_history (history_file):
    """
    Open the SQLite-backed job history database and return its handle.

    :param history_file: path to the history db file; when empty/None,
                         the default history file name is used instead.
    :returns: a dbdict handle onto the history database (values pickled).
    """
    db_path = history_file if history_file else get_default_history_file_name()
    return dbdict.open(db_path, picklevalues=True)
def needs_update_check_modify_time (*params, **kwargs):
    """
    Decide whether a job must be (re)run, based on its input and output files.

    Expects the job's input files as ``params[0]`` and output files as
    ``params[1]``.  Each can be

        #. string: assumed to be a filename "file1"
        #. any other type
        #. arbitrary nested sequence of (1) and (2)

    (non-string entries are filtered out via get_strings_in_nested_sequence).
    Any remaining ``params[2:]`` are treated as the job's extra arguments
    when comparing parameter checksums.

    kwargs:
        task : object with a ``checksum_level`` attribute controlling how
               thorough the up-to-date check is.  Optional; when absent we
               fall back to plain file-timestamp comparison.

    Returns ``(needs_update, reason_string)``, except for the degenerate
    "fewer than two params" case which returns a bare ``True``
    (NOTE(review): inconsistent with the tuple returned everywhere else --
    confirm callers handle both shapes).
    """
    # conditions for rerunning a job:
    #   1. forced to rerun entire taskset
    #   2. 1+ output files don't exist
    #   3. 1+ of input files is newer than 1+ output files
    #      -- ruffus does this level right now...
    #   4. internal completion time for that file is out of date
    #      (incomplete runs will be rerun automatically)
    #   5. checksum of code that ran the file is out of date
    #      (changes to function body result in rerun)
    #   6. checksum of the args that ran the file are out of date
    #      (appropriate config file changes result in rerun)
    try:
        task = kwargs['task']
    except KeyError:
        # allow the task not to be specified and fall back to classic
        # file timestamp behavior (either this or fix all the test cases,
        # which often don't have proper tasks)
        class Namespace:
            pass
        task = Namespace()
        task.checksum_level = CHECKSUM_FILE_TIMESTAMPS

    # NOTE(review): opens the module-level RUFFUS_HISTORY_FILE directly
    # rather than going through open_job_history(); confirm the two can
    # never point at different files.
    job_history = dbdict.open(RUFFUS_HISTORY_FILE, picklevalues=True)

    # missing output means build
    if len(params) < 2:
        return True

    i, o = params[0:2]
    i = get_strings_in_nested_sequence(i)
    o = get_strings_in_nested_sequence(o)

    #
    # build: missing output file
    #
    if len(o) == 0:
        return True, "Missing output file"

    # missing input / output file means always build
    missing_files = []
    for io in (i, o):
        for p in io:
            if not os.path.exists(p):
                missing_files.append(p)
    if len(missing_files):
        return True, "Missing file%s [%s]" % ("s" if len(missing_files) > 1 else "",
                                              ", ".join(missing_files))

    # existing files, but from previous interrupted runs
    # (the CHECKSUM_* constants are assumed ordered by strictness,
    # hence the >= comparisons -- TODO confirm against their definitions)
    if task.checksum_level >= CHECKSUM_HISTORY_TIMESTAMPS:
        incomplete_files = []
        func_changed_files = []
        param_changed_files = []
        #for io in (i, o):
        #    for p in io:
        #        if p not in job_history:
        #            incomplete_files.append(p)
        # an output file with no history entry means the job never
        # completed -- rerun it
        for p in o:
            if p not in job_history:
                incomplete_files.append(p)
        if len(incomplete_files):
            return True, "Previous incomplete run leftover%s: [%s]" % ("s" if len(incomplete_files) > 1 else "",
                                                                       ", ".join(incomplete_files))
        # check if function that generated our output file has changed
        for p in o:
            old_chksum = job_history[p]
            new_chksum = JobHistoryChecksum(p, None, params[2:], task)
            # parameter changes take precedence over function-body changes
            if task.checksum_level >= CHECKSUM_FUNCTIONS_AND_PARAMS and \
                    new_chksum.chksum_params != old_chksum.chksum_params:
                param_changed_files.append(p)
            elif task.checksum_level >= CHECKSUM_FUNCTIONS and \
                    new_chksum.chksum_func != old_chksum.chksum_func:
                func_changed_files.append(p)

        if len(func_changed_files):
            return True, "Pipeline function has changed: [%s]" % (", ".join(func_changed_files))
        if len(param_changed_files):
            return True, "Pipeline parameters have changed: [%s]" % (", ".join(param_changed_files))

    #
    # missing input -> build only if output absent or function is out of date
    # (output exists and checksums are clean at this point, so no rebuild)
    #
    if len(i) == 0:
        return False, "Missing input files"

    #
    # get sorted modified times for all input and output files
    # (index 0 = input files, index 1 = output files in both lists)
    #
    filename_to_times = [[], []]
    file_times = [[], []]

    #_____________________________________________________________________________________

    #   pretty_io_with_date_times

    #_____________________________________________________________________________________
    def pretty_io_with_date_times (filename_to_times):
        """
        Format the (mtime, filename) lists into a human-readable report,
        starring every file that contributes to the job being out of date.
        """

        # sort each of the input / output lists by (mtime, name)
        for io in range(2):
            filename_to_times[io].sort()

        #
        #   add asterisk for all files which are causing this job to be out of date
        #
        file_name_to_asterisk = dict()
        # inputs at least as new as the oldest output are culprits
        oldest_output_mtime = filename_to_times[1][0][0]
        for mtime, file_name in filename_to_times[0]:
            file_name_to_asterisk[file_name] = "*" if mtime >= oldest_output_mtime else " "
        # NOTE(review): despite its name this is the newest *input* mtime
        # (taken from filename_to_times[0]); outputs no newer than it are
        # marked stale -- consider renaming
        newest_output_mtime = filename_to_times[0][-1][0]
        for mtime, file_name in filename_to_times[1]:
            file_name_to_asterisk[file_name] = "*" if mtime <= newest_output_mtime else " "

        #
        #   try to fit in 100 - 15 = 85 char lines
        #   date time ~ 25 characters so limit file name to 55 characters
        #
        msg = "\n"
        category_names = "Input", "Output"
        for io in range(2):
            msg += " %s files:\n" % category_names[io]
            for mtime, file_name in filename_to_times[io]:
                file_datetime_str = epoch_seconds_to_str(mtime)
                msg += (" " +                                        # indent
                        file_name_to_asterisk[file_name] + " " +     # asterisked out of date files
                        file_datetime_str + ": " +                   # date time of file
                        get_readable_path_str(file_name, 55) + "\n") # file name truncated to 55
        return msg

    #
    #   Ignore output file if it is found in the list of input files
    #       By definition they have the same timestamp,
    #       and the job will otherwise appear to be out of date
    #
    #   Symbolic links followed
    real_input_file_names = set()
    for input_file_name in i:
        real_input_file_names.add(os.path.realpath(input_file_name))
        # take the *later* of the on-disk mtime and the recorded history
        # mtime, so an input touched after the run still triggers a rebuild
        if task.checksum_level >= CHECKSUM_HISTORY_TIMESTAMPS and input_file_name in job_history:
            mtime = max(os.path.getmtime(input_file_name), job_history[input_file_name].mtime)
        else:
            mtime = os.path.getmtime(input_file_name)
        filename_to_times[0].append((mtime, input_file_name))
        file_times[0].append(mtime)

    # for output files, we need to check modification time *in addition* to
    # function and argument checksums...
    for output_file_name in o:
        real_file_name = os.path.realpath(output_file_name)
        if task.checksum_level >= CHECKSUM_HISTORY_TIMESTAMPS:
            # take the *earlier* of the on-disk mtime and the recorded
            # history mtime, so the job reruns if either looks stale
            old_chksum = job_history[output_file_name]
            mtime = min(os.path.getmtime(output_file_name), old_chksum.mtime)
        else:
            mtime = os.path.getmtime(output_file_name)
        # a file that is both input and output only counts on the input
        # side of the staleness comparison (see comment block above)
        if real_file_name not in real_input_file_names:
            file_times[1].append(mtime)
        filename_to_times[1].append((mtime, output_file_name))

    #
    #   Debug: Force print modified file names and times
    #
    #if len(file_times[0]) and len (file_times[1]):
    #    print >>sys.stderr, pretty_io_with_date_times(filename_to_times), file_times, (max(file_times[0]) >= min(file_times[1]))
    #else:
    #    print >>sys.stderr, i, o

    #
    #   update if any input file >= (more recent) output file
    #
    if len(file_times[0]) and len (file_times[1]) and max(file_times[0]) >= min(file_times[1]):
        return True, pretty_io_with_date_times(filename_to_times)
    return False, "Up to date"