def file_current(self, fname, md5):
    """Tell whether ``fname`` is a regular file whose md5 equals ``md5``.

    The checksum is only computed when ``fname`` exists as a regular file;
    for any other path the result is False.
    """
    if not os.path.isfile(fname):
        return False
    actual_md5 = util.md5_file(fname)
    return actual_md5 == md5
def _sync_etc(self, headless=False):
    """Wait for the child process to finish, then finalize the run and sync files.

    In order, this method:

    1. Polls ``self._socket`` and ``self.proc`` until an exit code is
       available, or until Ctrl-C is pressed.
    2. Stores the exit code and the resulting state ("finished", "killed" or
       "failed") in ``self._meta.data``, calls ``shutdown()`` on
       ``self._meta`` and ``self._system_stats`` and closes the output
       streams.
    3. If ``self._cloud`` is falsy, calls ``self._socket.done()`` and returns.
    4. Otherwise prints the run summary and history, stops the file observer,
       finishes the event handlers and the file pusher, and shows upload
       progress until the pusher is no longer alive.
    5. Compares the md5 of each local file against the md5 reported by
       ``self._api.download_urls`` (retrying a few times) and reports whether
       the sync succeeded.

    Exit codes assigned here: 255 when interrupted with Ctrl-C, 254 when no
    exit code could be obtained.

    Args:
        headless: When true, the child process is not killed after Ctrl-C,
            and ``self._socket.done()`` is called once syncing has finished.

    Returns:
        None.
    """
    # Ignore SIGQUIT (ctrl-\). The child process will handle it, and we'll
    # exit when the child process does.
    #
    # We disable these signals after running the process so the child doesn't
    # inherit this behaviour.
    try:
        signal.signal(signal.SIGQUIT, signal.SIG_IGN)
    except AttributeError:  # SIGQUIT doesn't exist on windows
        pass

    exitcode = None
    try:
        while True:
            res = bytearray()
            try:
                res = self._socket.recv(2)
            except socket.timeout:
                pass
            if len(res) == 2 and res[0] == 2:
                # A two-byte message whose first byte is 2 carries the exit
                # code in its second byte.
                exitcode = res[1]
                break
            elif len(res) > 0:
                # NOTE(review): exitcode stays None here, so the
                # "Killing program failed" message below is printed even
                # though no kill was attempted -- confirm this is intended.
                wandb.termerror(
                    "Invalid message received from child process: %s" % str(res))
                break
            else:
                # Nothing received: fall back to polling the process itself.
                exitcode = self.proc.poll()
                if exitcode is not None:
                    break
                time.sleep(1)
    except KeyboardInterrupt:
        exitcode = 255
        wandb.termlog('Ctrl-c pressed; waiting for program to end.')
        keyboard_interrupt_time = time.time()
        if not headless:
            # give the process a couple of seconds to die, then kill it
            while self.proc.poll() is None and (
                    time.time() - keyboard_interrupt_time) < 2:
                time.sleep(0.1)
            if self.proc.poll() is None:
                wandb.termlog('Program still alive. Killing it.')
                try:
                    self.proc.kill()
                except OSError:
                    pass

    # TODO(adrian): garbage that appears in the logs sometimes
    #
    # Exception ignored in: <bound method Popen.__del__ of
    #     <subprocess.Popen object at 0x111adce48>>
    # Traceback (most recent call last):
    #   File ".../lib/python3.6/subprocess.py", line 760, in __del__
    # AttributeError: 'NoneType' object has no attribute 'warn'
    wandb.termlog()

    if exitcode is None:
        exitcode = 254
        wandb.termlog(
            'Killing program failed; syncing files anyway. Press ctrl-c to abort syncing.'
        )
    elif exitcode == 0:
        wandb.termlog('Program ended.')
    else:
        wandb.termlog(
            'Program failed with code %d. Press ctrl-c to abort syncing.'
            % exitcode)

    self._meta.data["exitcode"] = exitcode
    if exitcode == 0:
        self._meta.data["state"] = "finished"
    elif exitcode == 255:
        self._meta.data["state"] = "killed"
    else:
        self._meta.data["state"] = "failed"

    self._meta.shutdown()
    self._system_stats.shutdown()
    # NOTE(review): exitcode is never None at this point, so `or 254` only
    # takes effect when exitcode == 0, i.e. a successful run passes 254 here.
    # Left unchanged because _close_stdout_stderr_streams is not visible from
    # this block -- confirm whether this is intended.
    self._close_stdout_stderr_streams(exitcode or 254)

    # If we're not syncing to the cloud, we're done
    if not self._cloud:
        self._socket.done()
        return None

    # Show run summary/history
    self._run.summary.load()
    summary = self._run.summary._summary
    if len(summary):
        wandb.termlog('Run summary:')
        max_len = max(len(k) for k in summary.keys())
        format_str = ' {:>%s} {}' % max_len
        for k, v in summary.items():
            wandb.termlog(format_str.format(k, v))

    self._run.history.load()
    history_keys = self._run.history.keys()
    if len(history_keys):
        wandb.termlog('Run history:')
        max_len = max(len(k) for k in history_keys)
        # The format only depends on max_len, so build it once.
        format_str = u' {:>%s} {}' % max_len
        for key in history_keys:
            vals = util.downsample(self._run.history.column(key), 40)
            line = sparkline.sparkify(vals)
            wandb.termlog(format_str.format(key, line))

    if self._run.has_examples:
        wandb.termlog('Saved %s examples' % self._run.examples.count())

    wandb.termlog('Waiting for final file modifications.')
    # This is a heuristic delay to catch files that were written just before
    # the end of the script.
    # TODO: ensure we catch all saved files.
    # TODO(adrian): do we need this?
    time.sleep(2)
    try:
        # avoid hanging if we crashed before the observer was started
        if self._observer.is_alive():
            self._observer.stop()
            self._observer.join()
    # TODO: py2 TypeError: PyCObject_AsVoidPtr called with null pointer
    except TypeError:
        pass
    # TODO: py3 SystemError: <built-in function stop> returned a result with an error set
    except SystemError:
        pass

    for handler in self._event_handlers.values():
        handler.finish()
    self._file_pusher.finish()

    wandb.termlog('Syncing files in %s:' % os.path.relpath(self._watch_dir))
    for file_path in self._stats.files():
        wandb.termlog(' %s' % os.path.relpath(file_path, self._watch_dir))

    # Redraw a one-line progress indicator until the file pusher is done.
    # The liveness check happens before the line is printed, so the last
    # line reflects the stats read after the pusher stopped.
    step = 0
    spinner_states = ['-', '\\', '|', '/']
    stop = False
    self._stats.update_all_files()
    while True:
        if not self._file_pusher.is_alive():
            stop = True
        progress = self._stats.summary()
        line = (
            ' %(completed_files)s of %(total_files)s files,'
            ' %(uploaded_bytes).03f of %(total_bytes).03f bytes uploaded\r'
            % progress)
        line = spinner_states[step % 4] + line
        step += 1
        wandb.termlog(line, newline=False)
        if stop:
            break
        time.sleep(0.25)
    # clear progress line.
    wandb.termlog(' ' * 79)

    # Check md5s of uploaded files against what's on the file system.
    # TODO: We're currently using the list of uploaded files as our source
    #     of truth, but really we should use the files on the filesystem
    #     (ie if we missed a file this wouldn't catch it).
    # This polls the server, because there is a delay between when the file
    # is done uploading, and when the datastore gets updated with new
    # metadata via pubsub.
    wandb.termlog('Verifying uploaded files... ', newline=False)
    # Files excluded from the md5 comparison. The original condition
    # (`fname == 'wandb-history.h5' or OUTPUT_FNAME`) was true for every file
    # whenever OUTPUT_FNAME is non-empty, which skipped verification entirely.
    unverified_names = ('wandb-history.h5', OUTPUT_FNAME)
    max_attempts = 4
    mismatched = []
    for delay_base in range(max_attempts):
        mismatched = []
        download_urls = self._api.download_urls(self._project, run=self._run.id)
        for fname, info in download_urls.items():
            if fname in unverified_names:
                continue
            local_path = os.path.join(self._watch_dir, fname)
            local_md5 = util.md5_file(local_path)
            if local_md5 != info['md5']:
                mismatched.append((local_path, local_md5, info['md5']))
        if not mismatched:
            break
        if delay_base == max_attempts - 1:
            # Last attempt: no retry follows, so don't announce one or sleep.
            break
        wandb.termlog(' Retrying after %ss' % (delay_base**2))
        time.sleep(delay_base**2)

    if mismatched:
        print('')
        for local_path, local_md5, remote_md5 in mismatched:
            wandb.termerror(
                '%s (%s) did not match uploaded file (%s) md5' %
                (local_path, local_md5, remote_md5))
        wandb.termerror('Sync failed %s' % self.url)
    else:
        print('verified!')
        wandb.termlog('Synced %s' % self.url)

    if headless:
        self._socket.done()