def stop(self, wait_for_running_epochs=True):
    '''Stops the Training Coordinator.

    Nothing happens unless this instance is the chief and ``self._thread``
    is truthy. Otherwise the method optionally waits for the running epochs
    to drain and then calls ``shutdown()`` on ``self._httpd``.

    Kwargs:
        wait_for_running_epochs (bool): if true, poll every 5 seconds until
            ``self._epochs_running`` is empty before shutting down
    '''
    # Guard clause: non-chief instances (or a chief without a thread) have nothing to stop
    if not (self.is_chief and self._thread):
        return

    if wait_for_running_epochs:
        # Poll until no epoch is registered as running anymore
        while len(self._epochs_running):
            log_traffic('Coordinator is waiting for epochs to finish...')
            time.sleep(5)

    log_debug('Stopping coordinator...')
    self._httpd.shutdown()
    log_debug('Coordinator stopped.')
    def finish_job(self, job):
        '''Finishes a running job.

        Looks up the first entry of ``self.jobs_running`` whose ``id`` equals
        ``job.id``. If one is found, that entry is removed from the running
        list and the passed-in ``job`` is appended to ``self.jobs_done``.
        If none is found, only a warning is logged and no list is modified.

        Args:
            job (WorkerJob): the job to put into state 'done'
        '''
        # Position of the first running job with a matching ID, or -1 if there is none
        index = next(
            (i for i, running in enumerate(self.jobs_running) if running.id == job.id),
            -1)
        if index >= 0:
            self.jobs_running.pop(index)
            # Note: the job object handed in by the caller is stored, not the popped one
            self.jobs_done.append(job)
            log_traffic('%s - Moved %s from running to done.' % (self.name(), job))
        else:
            log_warn('%s - There is no job with ID %d registered as running.' % (self.name(), job.id))
示例#3
0
    def get_job(self, worker=0):
        '''Fetches the next job for a worker.

        On the chief the job is taken from the local epoch bookkeeping; if
        nothing is available, the epoch state machine is triggered once and
        the lookup is repeated. On a remote worker the request is forwarded
        to the chief over HTTP and the pickled answer is decoded.

        Kwargs:
            worker (int): index of the worker to get the first job for

        Returns:
            WorkerJob. a job of one of the running epochs that will get
                associated with the given worker and put into state 'running',
                or a falsy value (None on the chief) if there is no job
        '''
        # Serialize against other workers/requests touching the same state
        with self._lock:
            if not self.is_chief:
                # Remote worker: delegate to the chief worker by HTTP
                # NOTE(review): the response is unpickled - only safe if the chief is trusted
                payload = self._talk_to_chief(PREFIX_GET_JOB +
                                              str(FLAGS.task_index))
                return pickle.loads(payload) if payload else payload

            # First attempt: a job from one of the currently running epochs
            job = self._get_job(worker)
            if job is not None:
                log_traffic('Got new %s' % job)
                return job

            # Nothing available - ask the epoch state machine for a new epoch
            if not self._next_epoch():
                # No further epoch: this is the regular end of the whole training
                log_traffic('No jobs left for worker %d.' % (worker))
                self._log_all_jobs()
                return None

            # Second attempt, now against the freshly started epoch
            job = self._get_job(worker)
            if job is None:
                # A new epoch was started, but it had no job for this worker
                log_error(
                    'Unexpected case - no job for worker %d.' %
                    (worker))
            return job
示例#4
0
    def get_job(self, worker=0):
        '''Hands out the first available job for a worker.

        The chief answers from its own epochs and triggers the epoch state
        machine once if no job is immediately available. Any other worker
        asks the chief by HTTP and unpickles the reply.

        Kwargs:
            worker (int): index of the worker to get the first job for

        Returns:
            WorkerJob. a job of one of the running epochs that will get
                associated with the given worker and put into state 'running',
                or a falsy value (None on the chief) if there is no job
        '''
        # The lock keeps concurrent workers/requests from interfering with each other
        with self._lock:
            if self.is_chief:
                job = self._get_job(worker)
                if job is not None:
                    # A currently running epoch had a job for us
                    log_traffic('Got new %s' % job)
                elif self._next_epoch():
                    # The epoch state machine started a new epoch - try once more
                    job = self._get_job(worker)
                    if job is None:
                        # Even the new epoch had nothing for this worker
                        log_error('Unexpected case - no job for worker %d.' % (worker))
                else:
                    # No new epoch: end of the whole training - nothing to worry about
                    log_traffic('No jobs left for worker %d.' % (worker))
                    self._log_all_jobs()
                return job

            # Remote worker: hand over to the chief worker by HTTP
            # NOTE(review): the reply is unpickled - only safe if the chief is trusted
            response = self._talk_to_chief(PREFIX_GET_JOB + str(FLAGS.task_index))
            if not response:
                return response
            return pickle.loads(response)
示例#5
0
 def _talk_to_chief(self, path, data=None, default=None):
     '''Sends an HTTP request to the coordinator (chief) and returns its answer.

     The request is repeated up to ``FLAGS.coord_retries`` times, with a
     10 second pause after every attempt that did not produce a result.
     Only ``urllib.error.HTTPError`` is treated as a retryable problem;
     any other exception raised by urllib propagates to the caller.

     Args:
         path (str): URL path appended to ``http://<coord_host>:<coord_port>``

     Kwargs:
         data: request body handed to ``urllib.request.Request`` unchanged
         default: value to return on status 204 or once all tries are used up

     Returns:
         The raw response body on status 200, otherwise ``default``.
     '''
     tries = 0
     while tries < FLAGS.coord_retries:
         tries += 1
         url = 'http://%s:%d%s' % (FLAGS.coord_host, FLAGS.coord_port,
                                   path)
         log_traffic('Contacting coordinator - url: %s, tries: %d ...' %
                     (url, tries - 1))
         request = urllib.request.Request(url, data,
                                          {'content-type': 'text/plain'})
         try:
             # The context manager closes the response (and its socket) on every path
             with urllib.request.urlopen(request) as res:
                 body = res.read()
                 status = res.getcode()
         except urllib.error.HTTPError as error:
             log_traffic(
                 'Problem reaching coordinator - url: %s, HTTP code: %d' %
                 (url, error.code))
         else:
             log_traffic('Coordinator responded - url: %s, status: %s' %
                         (url, status))
             if status == 200:
                 return body
             if status == 204:  # We use 204 (no content) to indicate end of training
                 return default
         # Neither a result nor end-of-training: back off before the next try
         time.sleep(10)
     return default
    def get_next_index(self, set_name):
        '''Retrieves a new cluster-unique batch index for a given set-name.
        Prevents applying one batch multiple times per epoch.

        On the chief the counter lives in the instance attribute
        ``_index_<set_name>``: its current value is returned and the stored
        value is incremented by one. If the attribute does not exist yet,
        -1 is returned and the attribute is created with value 0.

        Args:
            set_name (str): name of the data set - one of 'train', 'dev'

        Returns:
            int. new data set index
        '''
        with self._lock:
            if not self.is_chief:
                # Remote worker: the chief owns the counters, so ask it by HTTP
                log_traffic('Asking for next index...')
                index = int(self._talk_to_chief(PREFIX_NEXT_INDEX + set_name))
                log_traffic('Got index %d.' % index)
                return index

            # Chief: hand out the current counter value, then advance it
            attr_name = '_index_' + set_name
            current = getattr(self, attr_name, -1)
            setattr(self, attr_name, current + 1)
            return current
示例#7
0
 def _talk_to_chief(self, path, data=None, default=None):
     '''Performs an HTTP round trip to the coordinator (chief).

     Up to ``FLAGS.coord_retries`` attempts are made; every attempt that
     yields no result is followed by a 10 second sleep. Only
     ``urllib.error.HTTPError`` is caught and retried - other urllib
     exceptions are not handled here and reach the caller.

     Args:
         path (str): URL path appended to ``http://<coord_host>:<coord_port>``

     Kwargs:
         data: request body passed through to ``urllib.request.Request``
         default: value returned on status 204 or after the last failed try

     Returns:
         The raw response body on status 200, otherwise ``default``.
     '''
     tries = 0
     while tries < FLAGS.coord_retries:
         tries += 1
         url = 'http://%s:%d%s' % (FLAGS.coord_host, FLAGS.coord_port, path)
         log_traffic('Contacting coordinator - url: %s, tries: %d ...' % (url, tries-1))
         request = urllib.request.Request(url, data, { 'content-type': 'text/plain' })
         try:
             # Using the response as a context manager guarantees it gets closed
             with urllib.request.urlopen(request) as res:
                 body = res.read()
                 status = res.getcode()
         except urllib.error.HTTPError as error:
             log_traffic('Problem reaching coordinator - url: %s, HTTP code: %d' % (url, error.code))
         else:
             log_traffic('Coordinator responded - url: %s, status: %s' % (url, status))
             if status == 200:
                 return body
             if status == 204: # We use 204 (no content) to indicate end of training
                 return default
         # No usable answer this time: wait before trying again
         time.sleep(10)
     return default