def stop(self, wait_for_running_epochs=True):
    '''Stops the Training Coordinator.

    Only has an effect on the chief and only if its server thread was set;
    otherwise this is a no-op.

    Kwargs:
        wait_for_running_epochs (bool): if True, blocks (polling every
            5 seconds) until no epoch is registered as running anymore
            before the web server is shut down
    '''
    # Nothing to stop unless we are the chief with a server thread
    if not (self.is_chief and self._thread):
        return

    # Optionally give the running epochs time to complete
    while wait_for_running_epochs and len(self._epochs_running) > 0:
        log_traffic('Coordinator is waiting for epochs to finish...')
        time.sleep(5)

    log_debug('Stopping coordinator...')
    self._httpd.shutdown()
    log_debug('Coordinator stopped.')
def finish_job(self, job):
    '''Marks a running job as done.

    Looks up the running job whose ID matches the given job's ID, removes
    that entry from the running jobs list and appends the given job to the
    done jobs list. If no running job has that ID, only a warning is logged.

    Args:
        job (WorkerJob): the job to put into state 'done'
    '''
    # Locate the first running job carrying the same ID
    position = -1
    for candidate_index, running_job in enumerate(self.jobs_running):
        if running_job.id == job.id:
            position = candidate_index
            break

    if position < 0:
        log_warn('%s - There is no job with ID %d registered as running.' % (self.name(), job.id))
        return

    self.jobs_running.pop(position)
    # Note: the passed-in job object (not the popped one) is recorded as done
    self.jobs_done.append(job)
    log_traffic('%s - Moved %s from running to done.' % (self.name(), job))
def get_job(self, worker=0):
    '''Fetches the next job for a worker.

    On the chief the job is taken from the running epochs; if none is
    available, the epoch state machine is triggered once and the lookup is
    repeated. On a remote worker the request is forwarded to the chief.

    Kwargs:
        worker (int): index of the worker to get the next job for

    Returns:
        WorkerJob. a job of one of the running epochs that will get
        associated with the given worker and put into state 'running',
        or None if there is no job left
    '''
    # Serialize against other workers/requests
    with self._lock:
        if not self.is_chief:
            # Remote worker: delegate to the chief worker by HTTP
            payload = self._talk_to_chief(PREFIX_GET_JOB + str(FLAGS.task_index))
            if not payload:
                return payload
            # NOTE(review): pickle.loads on data received over HTTP is only
            # safe if the coordinator endpoint is trusted - confirm
            return pickle.loads(payload)

        # First attempt: one of the currently running epochs may have a job
        job = self._get_job(worker)
        if job is not None:
            log_traffic('Got new %s' % job)
            return job

        # No job - give the epoch state machine a chance to start a new epoch
        if not self._next_epoch():
            # No new epoch: this happens at the end of the whole training
            log_traffic('No jobs left for worker %d.' % (worker))
            self._log_all_jobs()
            return None

        # Second attempt, now against the freshly started epoch
        job = self._get_job(worker)
        if job is None:
            # A new epoch was started, yet it had no job for us
            log_error('Unexpected case - no job for worker %d.' % (worker))
        return job
def get_job(self, worker=0):
    '''Retrieves the next job for a worker.

    The chief serves the job from its running epochs and, if there is none,
    triggers the epoch state machine once before trying again. A remote
    worker hands the request over to the chief by HTTP.

    Kwargs:
        worker (int): index of the worker to get the next job for

    Returns:
        WorkerJob. a job of one of the running epochs that will get
        associated with the given worker and put into state 'running',
        or None if there is no job left
    '''
    # Keep other workers/requests out while a job is being handed out
    with self._lock:
        if self.is_chief:
            next_job = self._get_job(worker)
            if next_job is not None:
                # A currently running epoch had a job for us
                log_traffic('Got new %s' % next_job)
                return next_job

            epoch_started = self._next_epoch()
            if epoch_started:
                # Retry once against the newly started epoch
                next_job = self._get_job(worker)
                if next_job is None:
                    # The new epoch unexpectedly had no job for us
                    log_error('Unexpected case - no job for worker %d.' % (worker))
                return next_job

            # No further epoch: end of the whole training, nothing to worry about
            log_traffic('No jobs left for worker %d.' % (worker))
            self._log_all_jobs()
            return None

        # Remote worker: ask the chief worker by HTTP
        response = self._talk_to_chief(PREFIX_GET_JOB + str(FLAGS.task_index))
        # NOTE(review): unpickling data received over HTTP is only safe if
        # the coordinator endpoint is trusted - confirm
        return pickle.loads(response) if response else response
def _talk_to_chief(self, path, data=None, default=None):
    '''Sends an HTTP request to the coordinator and returns the response body.

    The request is attempted up to FLAGS.coord_retries times, with a pause
    of 10 seconds after every unsuccessful attempt. Only HTTP errors
    (urllib.error.HTTPError) are caught and retried; any other exception
    propagates to the caller.

    Args:
        path (str): URL path (including leading slash) appended to the
            coordinator's host and port

    Kwargs:
        data: optional request body handed to urllib.request.Request
        default: value returned on status 204 or when all tries are used up

    Returns:
        The raw response body on status 200, otherwise default.
    '''
    # The URL is the same for every attempt, so build it once
    url = 'http://%s:%d%s' % (FLAGS.coord_host, FLAGS.coord_port, path)
    tries = 0
    while tries < FLAGS.coord_retries:
        tries += 1
        try:
            log_traffic('Contacting coordinator - url: %s, tries: %d ...' % (url, tries - 1))
            request = urllib.request.Request(url, data, {'content-type': 'text/plain'})
            res = urllib.request.urlopen(request)
            try:
                body = res.read()
                status = res.getcode()
            finally:
                # Always release the connection, even if reading fails
                res.close()
            log_traffic('Coordinator responded - url: %s, status: %s' % (url, status))
            if status == 200:
                return body
            if status == 204:
                # We use 204 (no content) to indicate end of training
                return default
        except urllib.error.HTTPError as error:
            log_traffic('Problem reaching coordinator - url: %s, HTTP code: %d' % (url, error.code))
        # Unsuccessful attempt (HTTP error or unexpected status): back off
        time.sleep(10)
    return default
def get_next_index(self, set_name):
    '''Retrieves a new cluster-unique batch index for a given set-name.
    Prevents applying one batch multiple times per epoch.

    On the chief the index is kept in the attribute '_index_<set_name>':
    its current value is returned (-1 if the attribute does not exist yet)
    and the attribute is then set to that value plus one. A remote worker
    asks the chief for the index by HTTP.

    Args:
        set_name (str): name of the data set - one of 'train', 'dev'

    Returns:
        int. new data set index
    '''
    with self._lock:
        if not self.is_chief:
            # Remote worker: hand over to the chief worker by HTTP
            log_traffic('Asking for next index...')
            remote_index = int(self._talk_to_chief(PREFIX_NEXT_INDEX + set_name))
            log_traffic('Got index %d.' % remote_index)
            return remote_index

        # Chief: hand out the current counter value and advance the counter
        counter_name = '_index_' + set_name
        current = getattr(self, counter_name, -1)
        setattr(self, counter_name, current + 1)
        return current
def _talk_to_chief(self, path, data=None, default=None):
    '''Performs an HTTP request against the coordinator and returns its response body.

    Up to FLAGS.coord_retries attempts are made; after each unsuccessful
    attempt the method sleeps for 10 seconds. Only urllib.error.HTTPError
    is caught and leads to a retry; other exceptions are not handled here.

    Args:
        path (str): URL path (including leading slash) appended to the
            coordinator's host and port

    Kwargs:
        data: optional request body handed to urllib.request.Request
        default: value returned on status 204 or when all tries are used up

    Returns:
        The raw response body on status 200, otherwise default.
    '''
    # Loop-invariant: the target URL does not depend on the attempt number
    url = 'http://%s:%d%s' % (FLAGS.coord_host, FLAGS.coord_port, path)
    headers = {'content-type': 'text/plain'}
    tries = 0
    while tries < FLAGS.coord_retries:
        tries += 1
        try:
            log_traffic('Contacting coordinator - url: %s, tries: %d ...' % (url, tries - 1))
            res = urllib.request.urlopen(urllib.request.Request(url, data, headers))
            try:
                payload = res.read()
                status = res.getcode()
            finally:
                # Close the response so the underlying connection is not leaked
                res.close()
            log_traffic('Coordinator responded - url: %s, status: %s' % (url, status))
            if status == 200:
                return payload
            if status == 204:
                # We use 204 (no content) to indicate end of training
                return default
        except urllib.error.HTTPError as error:
            log_traffic('Problem reaching coordinator - url: %s, HTTP code: %d' % (url, error.code))
        # Reached after an HTTP error or an unexpected status: wait, then retry
        time.sleep(10)
    return default