def collect_result(self, session):
    """Collect result from processed workers.

    Drains the processing queue, requeues workers that are still
    running, and records the outcome of the finished ones.
    """
    # Drain the processing queue in one shot. ``zip(*[])`` raises
    # ValueError when the queue is empty, which signals hunger.
    try:
        drained = [
            self._processing_worker_queue.get()
            for _ in range(self._processing_worker_queue.qsize())
        ]
        workers, submissions = zip(*drained)
    except ValueError:
        logger.info('No workers are currently waiting or processed.')
        if self.hunger_policy == 'sleep':
            time.sleep(5)
        elif self.hunger_policy == 'exit':
            self._poison_pill = True
        return
    for worker, (submission_id, submission_name) in zip(workers, submissions):
        if worker.status == 'running':
            # Not finished yet: put it back and yield the CPU briefly.
            self._processing_worker_queue.put_nowait(
                (worker, (submission_id, submission_name)))
            logger.info('Worker {} is still running'.format(worker))
            time.sleep(0)
            continue
        logger.info('Collecting results from worker {}'.format(worker))
        returncode, stderr = worker.collect_results()
        # A non-zero return code marks the training as failed.
        new_state = 'tested' if not returncode else 'training_error'
        set_submission_state(session, submission_id, new_state)
        set_submission_error_msg(session, submission_id, stderr)
        self._processed_submission_queue.put_nowait(
            (submission_id, submission_name))
        worker.teardown()
def update_database_results(self, session):
    """Update the database with the results of ramp_test_submission.

    Pops each processed submission, stores its predictions, timings and
    scores, and refreshes the leaderboards after each one.
    """
    event_name = self._ramp_config['event_name']
    queue = self._processed_submission_queue
    while not queue.empty():
        submission_id, submission_name = queue.get_nowait()
        state = get_submission_state(session, submission_id)
        if 'error' in state:
            # Failed submissions carry no scores; refresh the
            # leaderboards anyway so the failure becomes visible.
            update_leaderboards(session, event_name)
            update_all_user_leaderboards(session, event_name)
            logger.info('Skip update for {} due to failure during the '
                        'processing'.format(submission_name))
            continue
        logger.info('Update the results obtained on each fold for '
                    '{}'.format(submission_name))
        path_predictions = os.path.join(
            self._worker_config['predictions_dir'], submission_name)
        set_predictions(session, submission_id, path_predictions)
        set_time(session, submission_id, path_predictions)
        set_scores(session, submission_id, path_predictions)
        set_bagged_scores(session, submission_id, path_predictions)
        set_submission_state(session, submission_id, 'scored')
        update_leaderboards(session, event_name)
        update_all_user_leaderboards(session, event_name)
def update_database_results(self, session):
    """Update the database with the results of ramp_test_submission.

    Parameters
    ----------
    session : database session
        Open session used to read submission states and write results.
    """
    # Refresh the leaderboards only once, and only if at least one
    # submission was actually processed during this pass.
    make_update_leaderboard = False
    while not self._processed_submission_queue.empty():
        make_update_leaderboard = True
        submission_id, submission_name = \
            self._processed_submission_queue.get_nowait()
        # Errored submissions have no results to store.
        if 'error' in get_submission_state(session, submission_id):
            continue
        logger.info('Write info in database for submission {}'.format(
            submission_name))
        path_predictions = os.path.join(
            self._worker_config['predictions_dir'], submission_name)
        # NOTE: In the past we were adding the predictions into the
        # database. Since they require too much space, we stop to store
        # them in the database and instead, keep it onto the disk.
        # set_predictions(session, submission_id, path_predictions)
        set_time(session, submission_id, path_predictions)
        set_scores(session, submission_id, path_predictions)
        set_bagged_scores(session, submission_id, path_predictions)
        set_submission_state(session, submission_id, 'scored')
    if make_update_leaderboard:
        logger.info('Update all leaderboards')
        update_leaderboards(session, self._ramp_config['event_name'])
        update_all_user_leaderboards(session,
                                     self._ramp_config['event_name'])
def fetch_from_db(self, session):
    """Fetch the submission from the database and create the workers."""
    event_name = self._ramp_config['event_name']
    new_submissions = get_submissions(session, event_name, state='new')
    if not new_submissions:
        return
    for submission_id, submission_name, _ in new_submissions:
        submission = get_submission_by_id(session, submission_id)
        # The sandbox submission is never trained.
        if not submission.is_not_sandbox:
            continue
        # Build a worker for this submission and mark it as dispatched.
        worker = self.worker(self._worker_config, submission_name)
        set_submission_state(session, submission_id, 'sent_to_training')
        update_user_leaderboards(
            session, event_name, submission.team.name, new_only=True,
        )
        self._awaiting_worker_queue.put_nowait(
            (worker, (submission_id, submission_name)))
        logger.info('Submission {} added to the queue of submission to be '
                    'processed'.format(submission_name))
def collect_result(self, session):
    """Collect result from processed workers.

    Drains the processing queue, requeues workers that are still running
    (or were polled too recently), re-dispatches interrupted workers, and
    records the final state of the finished ones in the database.
    """
    try:
        # Drain the whole processing queue at once; ``zip(*[])`` raises
        # ValueError when the queue is empty.
        workers, submissions = zip(*[
            self._processing_worker_queue.get()
            for _ in range(self._processing_worker_queue.qsize())
        ])
    except ValueError:
        # Nothing to collect: apply the configured hunger policy.
        if self.hunger_policy == 'sleep':
            time.sleep(5)
        elif self.hunger_policy == 'exit':
            self._poison_pill = True
        return
    for worker, (submission_id, submission_name) in zip(workers,
                                                        submissions):
        # Throttle status checks: if the worker was polled more recently
        # than ``time_between_collection`` seconds ago, requeue it as-is
        # without even querying its status.
        dt = worker.time_since_last_status_check()
        if (dt is not None) and (dt < self.time_between_collection):
            self._processing_worker_queue.put_nowait(
                (worker, (submission_id, submission_name)))
            time.sleep(0)
            continue
        elif worker.status == 'running':
            # Still training: put it back and yield the CPU.
            self._processing_worker_queue.put_nowait(
                (worker, (submission_id, submission_name)))
            time.sleep(0)
        elif worker.status == 'retry':
            # Worker reports it was interrupted: reset the submission to
            # 'new' so it is picked up and dispatched again.
            set_submission_state(session, submission_id, 'new')
            self._logger.info(
                f'Submission: {submission_id} has been interrupted. '
                'It will be added to queue again and retried.')
            worker.teardown()
        else:
            self._logger.info(f'Collecting results from worker {worker}')
            returncode, stderr = worker.collect_results()
            if returncode:
                if returncode == 124:
                    # presumably the exit code convention of a timeout
                    # wrapper — TODO confirm against the worker impl.
                    self._logger.info(
                        f'Worker {worker} killed due to timeout.')
                    submission_status = 'training_error'
                elif returncode == 2:
                    # Error occurred when downloading the logs
                    submission_status = 'checking_error'
                else:
                    self._logger.info(
                        f'Worker {worker} killed due to an error '
                        f'during training: {stderr}')
                    submission_status = 'training_error'
            else:
                submission_status = 'tested'
            set_submission_state(session, submission_id, submission_status)
            set_submission_error_msg(session, submission_id, stderr)
            self._processed_submission_queue.put_nowait(
                (submission_id, submission_name))
            worker.teardown()
def launch_workers(self, session):
    """Launch the awaiting workers if possible.

    Moves workers from the awaiting queue into the processing queue as
    long as the latter has free slots.
    """
    processing = self._processing_worker_queue
    awaiting = self._awaiting_worker_queue
    while not processing.full() and not awaiting.empty():
        worker, (submission_id, submission_name) = awaiting.get()
        logger.info('Starting worker: {}'.format(worker))
        worker.setup()
        worker.launch_submission()
        set_submission_state(session, submission_id, 'training')
        processing.put_nowait((worker, (submission_id, submission_name)))
        logger.info(
            'Store the worker {} into the processing queue'.format(worker))
    if processing.full():
        logger.info('The processing queue is full. Waiting for a worker to'
                    ' finish')
def collect_result(self, session):
    """Collect result from processed workers.

    Drains the processing queue, requeues workers that are still running
    or were polled too recently, and stores the outcome of the rest.
    """
    # Drain the processing queue; an empty queue triggers the hunger
    # policy instead of a collection pass.
    pending = [
        self._processing_worker_queue.get()
        for _ in range(self._processing_worker_queue.qsize())
    ]
    if not pending:
        if self.hunger_policy == 'sleep':
            time.sleep(5)
        elif self.hunger_policy == 'exit':
            self._poison_pill = True
        return
    for worker, (submission_id, submission_name) in pending:
        dt = worker.time_since_last_status_check()
        throttled = dt is not None and dt < self.time_between_collection
        # Short-circuit keeps ``worker.status`` unqueried when throttled.
        if throttled or worker.status == 'running':
            self._processing_worker_queue.put_nowait(
                (worker, (submission_id, submission_name)))
            time.sleep(0)
            continue
        logger.info(f'Collecting results from worker {worker}')
        returncode, stderr = worker.collect_results()
        if not returncode:
            submission_status = 'tested'
        else:
            if returncode == 124:
                logger.info(
                    'Worker {} killed due to timeout.'.format(worker))
            else:
                logger.info(f'Worker {worker} killed due to an error '
                            'during training')
            submission_status = 'training_error'
        set_submission_state(session, submission_id, submission_status)
        set_submission_error_msg(session, submission_id, stderr)
        self._processed_submission_queue.put_nowait(
            (submission_id, submission_name))
        worker.teardown()
def launch_workers(self, session):
    """Launch the awaiting workers if possible.

    Moves workers from the awaiting queue into the processing queue as
    long as there is capacity; submissions whose worker fails during
    setup/launch are marked 'checking_error' with an explanatory message.
    """
    while (not self._processing_worker_queue.full() and
           not self._awaiting_worker_queue.empty()):
        worker, (submission_id, submission_name) = \
            self._awaiting_worker_queue.get()
        self._logger.info(f'Starting worker: {worker}')
        try:
            worker.setup()
            if worker.status != "error":
                worker.launch_submission()
        except Exception as e:
            # Map any unhandled exception onto the worker 'error' status
            # so the single failure path below handles it.
            self._logger.error(
                f'Worker finished with unhandled exception:\n {e}')
            worker.status = 'error'
        if worker.status == 'error':
            set_submission_state(session, submission_id, 'checking_error')
            worker.teardown()  # kill the worker
            self._logger.info(f'Worker {worker} killed due to an error '
                              f'while connecting to AWS worker')
            # User-facing message stored as the submission's error text.
            stderr = ("There was a problem with sending your submission"
                      " for training. This problem is on RAMP side"
                      " and most likely it is not related to your"
                      " code. If this happened for the first time"
                      " to this submission you might"
                      " consider submitting the same code once again."
                      " Else, please contact the event organizers.")
            set_submission_error_msg(session, submission_id, stderr)
            continue
        set_submission_state(session, submission_id, 'training')
        submission = get_submission_by_id(session, submission_id)
        update_user_leaderboards(
            session, self._ramp_config['event_name'], submission.team.name,
            new_only=True,
        )
        self._processing_worker_queue.put_nowait(
            (worker, (submission_id, submission_name)))
        self._logger.info(
            f'Store the worker {worker} into the processing queue')
def test_score_submission(session_scope_module):
    """Scoring requires the 'tested' state and then stores per-fold
    train/valid/test scores matching the reference values."""
    submission_id = 9
    # Two folds x three steps, matching the stored prediction layout.
    multi_index = pd.MultiIndex.from_product(
        [[0, 1], ['train', 'valid', 'test']], names=['fold', 'step']
    )
    expected_df = pd.DataFrame(
        {'acc': [0.604167, 0.583333, 0.733333, 0.604167, 0.583333,
                 0.733333],
         'error': [0.395833, 0.416667, 0.266667, 0.395833, 0.416667,
                   0.266667],
         'nll': [0.732763, 2.194549, 0.693464, 0.746132, 2.030762,
                 0.693992],
         'f1_70': [0.333333, 0.33333, 0.666667, 0.33333, 0.33333,
                   0.666667]},
        index=multi_index
    )
    path_results = os.path.join(HERE, 'data', 'iris_predictions')
    # Scoring before the submission is marked 'tested' must fail.
    with pytest.raises(ValueError, match='Submission state must be "tested"'):
        score_submission(session_scope_module, submission_id)
    set_submission_state(session_scope_module, submission_id, 'tested')
    set_predictions(session_scope_module, submission_id, path_results)
    score_submission(session_scope_module, submission_id)
    scores = get_scores(session_scope_module, submission_id)
    # NOTE(review): ``check_less_precise`` is deprecated in recent pandas;
    # consider ``rtol``/``atol`` when upgrading.
    assert_frame_equal(scores, expected_df, check_less_precise=True)
def launch_workers(self, session):
    """Launch the awaiting workers if possible.

    Workers whose setup or launch ends in an 'error' status get their
    submission flagged as 'checking_error' and are dropped.
    """
    while (not self._processing_worker_queue.full() and
           not self._awaiting_worker_queue.empty()):
        worker, (submission_id, submission_name) = \
            self._awaiting_worker_queue.get()
        logger.info('Starting worker: {}'.format(worker))
        # Run setup then launch; abort on the first step that leaves the
        # worker in the 'error' status.
        failed = False
        for step in (worker.setup, worker.launch_submission):
            step()
            if worker.status == 'error':
                set_submission_state(session, submission_id,
                                     'checking_error')
                failed = True
                break
        if failed:
            continue
        set_submission_state(session, submission_id, 'training')
        submission = get_submission_by_id(session, submission_id)
        update_user_leaderboards(
            session, self._ramp_config['event_name'], submission.team.name,
            new_only=True,
        )
        self._processing_worker_queue.put_nowait(
            (worker, (submission_id, submission_name)))
        logger.info(
            'Store the worker {} into the processing queue'.format(worker))
def launch(self):
    """Launch the dispatcher.

    Runs the fetch/launch/collect/update loop until the poison pill is
    set, then resets any in-flight submission back to 'new' so it can be
    picked up by the next dispatcher run.
    """
    logger.info('Starting the RAMP dispatcher')
    with session_scope(self._database_config) as session:
        logger.info('Open a session to the database')
        try:
            while not self._poison_pill:
                self.fetch_from_db(session)
                self.launch_workers(session)
                self.collect_result(session)
                self.update_database_results(session)
        finally:
            # reset the submissions to 'new' in case of error or unfinished
            # training
            submissions = get_submissions(
                session, self._ramp_config['event_name'], state=None)
            for submission_id, _, _ in submissions:
                submission_state = get_submission_state(
                    session, submission_id)
                # BUGFIX: the state written at dispatch time is
                # 'sent_to_training' (see fetch_from_db); the previous
                # check used the misspelled 'send_to_training' and never
                # matched, leaving dispatched submissions stuck.
                if submission_state in ('training', 'sent_to_training'):
                    set_submission_state(session, submission_id, 'new')
        logger.info('Dispatcher killed by the poison pill')
def update_database_results(self, session):
    """Update the database with the results of ramp_test_submission.

    Pops every processed submission, stores its predictions, timings and
    scores, and refreshes the leaderboards once at the end if anything
    was processed.
    """
    make_update_leaderboard = False
    while not self._processed_submission_queue.empty():
        make_update_leaderboard = True
        submission_id, submission_name = \
            self._processed_submission_queue.get_nowait()
        # Errored submissions have no results to store; the leaderboard
        # refresh below still runs so the failure becomes visible.
        if 'error' in get_submission_state(session, submission_id):
            continue
        # BUGFIX: fixed log-message typo ('data base' -> 'database').
        logger.info('Write info in database for submission {}'.format(
            submission_name))
        path_predictions = os.path.join(
            self._worker_config['predictions_dir'], submission_name)
        set_predictions(session, submission_id, path_predictions)
        set_time(session, submission_id, path_predictions)
        set_scores(session, submission_id, path_predictions)
        set_bagged_scores(session, submission_id, path_predictions)
        set_submission_state(session, submission_id, 'scored')
    if make_update_leaderboard:
        logger.info('Update all leaderboards')
        update_leaderboards(session, self._ramp_config['event_name'])
        update_all_user_leaderboards(session,
                                     self._ramp_config['event_name'])
def test_make_submission_resubmission(base_db):
    """Resubmitting a submission with the same name raises an error
    unless the previous one is tagged 'new' or failed."""
    session = base_db
    config = ramp_config_template()
    event_name, username = _setup_sign_up(session)
    ramp_config = generate_ramp_config(read_config(config))
    # submitting the starting_kit which is used as the default submission
    # for the sandbox should raise an error
    err_msg = ('Submission "starting_kit" of team "test_user" at event '
               '"iris_test" exists already')
    with pytest.raises(DuplicateSubmissionError, match=err_msg):
        add_submission(session, event_name, username,
                       os.path.basename(ramp_config['ramp_sandbox_dir']),
                       ramp_config['ramp_sandbox_dir'])
    # submitting twice a normal submission should raise an error as well
    submission_name = 'random_forest_10_10'
    path_submission = os.path.join(
        os.path.dirname(ramp_config['ramp_sandbox_dir']), submission_name)
    # first submission
    add_submission(
        session, event_name, username, submission_name, path_submission,
    )
    # mock that we scored the submission
    set_submission_state(session, 5, 'scored')
    # second submission
    err_msg = ('Submission "random_forest_10_10" of team "test_user" at event '
               '"iris_test" exists already')
    with pytest.raises(DuplicateSubmissionError, match=err_msg):
        add_submission(session, event_name, username, submission_name,
                       path_submission)
    # a resubmission can take place if it is tagged as "new" or failed
    # mock that the submission failed during the training
    set_submission_state(session, 5, 'training_error')
    add_submission(session, event_name, username, submission_name,
                   path_submission)
    # mock that the submissions are new submissions
    set_submission_state(session, 5, 'new')
    add_submission(session, event_name, username, submission_name,
                   path_submission)
def _reset_submission_after_failure(session, even_name):
    """Reset in-flight submissions of an event back to 'new'.

    Parameters
    ----------
    session : database session
        Open session used to read and update submission states.
    even_name : str
        The event name. (Name kept as-is for backward compatibility;
        presumably a typo for ``event_name``.)
    """
    submissions = get_submissions(session, even_name, state=None)
    for submission_id, _, _ in submissions:
        submission_state = get_submission_state(session, submission_id)
        # BUGFIX: dispatched submissions are in state 'sent_to_training';
        # the previous misspelled check 'send_to_training' never matched,
        # so those submissions were never reset.
        if submission_state in ('training', 'sent_to_training'):
            set_submission_state(session, submission_id, 'new')
def test_set_submission_state_unknown_state(session_scope_module):
    """Setting a state outside the allowed set raises UnknownStateError."""
    expected_error = pytest.raises(UnknownStateError,
                                   match='Unrecognized state')
    with expected_error:
        set_submission_state(session_scope_module, 2, 'unknown')
def test_set_submission_state(session_scope_module):
    """A state written with set_submission_state is read back unchanged."""
    sub_id = 2
    set_submission_state(session_scope_module, sub_id, 'trained')
    assert get_submission_state(session_scope_module, sub_id) == 'trained'