def maybe_start_unassigned_runner() -> None: """Start an unassigned runner if we have jobs that need a runner. """ # Do not use count so we can lock all these runners, this makes sure we # will not create new runners while we are counting. active_runners = models.Runner.get_all_active_runners() not_assigned_runners = len( active_runners.filter_by(job_id=None).with_for_update().all()) sql = db.session.query( t.cast(DbColumn[int], models.Job._wanted_runners), # pylint: disable=protected-access t.cast(t.Type[int], sql_func.count(models.Runner.id))).filter( models.Job._state != models.JobState.finished, # pylint: disable=protected-access ).join(models.Runner, and_( models.Runner.job_id == models.Job.id, t.cast(DbColumn[models.RunnerState], models.Runner.state).in_( models.RunnerState.get_active_states())), isouter=True).group_by(models.Job.id) needed_runners = sum(max(0, wanted - has) for wanted, has in sql) with bound_to_logger( needed_runners=needed_runners, unassigned_runners=not_assigned_runners, ): if needed_runners < not_assigned_runners: logger.error('More runners than jobs active') elif needed_runners == not_assigned_runners: logger.info('No new runners needed') else: logger.info('More runners are needed') start_unassigned_runner(needed_runners)
def _adjust_amount_runners_1(auto_test_run_id: int) -> None: run = p.models.AutoTestRun.query.filter_by( id=auto_test_run_id).with_for_update().one_or_none() if run is None: logger.warning('Run to adjust not found') return elif run.finished: logger.info('Run already finished, no adjustments needed') return requested_amount = run.runners_requested needed_amount = run.get_amount_needed_runners() with cg_logger.bound_to_logger(requested_amount=requested_amount, needed_amount=needed_amount): if needed_amount == requested_amount: logger.info('No new runners needed') return elif needed_amount == 0 and run.runners_requested < 2: # If we have requested more than 2 runners we should decrease this, # so do send this request to the broker. logger.info("We don't need any runners") return elif needed_amount > requested_amount: logger.info('We need more runners') else: logger.info('We have requested too many runners') _notify_broker_of_new_job_1(run, needed_amount)
def maybe_kill_unneeded_runner(runner_hex_id: str) -> None: """Maybe kill the given runner if it is still unassigned. :param runner_hex_id: The hex id of the runner to maybe kill. """ # TODO: If a runner goes from unassigned, to assigned, to unassigned we # might kill it too soon. As we start a job to kill it for the first time # it is unassigned, and this job might be executed while it is unassigned # for the second time. runner_id = uuid.UUID(hex=runner_hex_id) runner = db.session.query( models.Runner).filter_by(id=runner_id).with_for_update().one_or_none() with bound_to_logger(runner=runner): if runner is None: logger.warning('Runner was deleted') return elif not runner.is_before_run: logger.info('Runner has already finished running', runner_state=runner.state) return elif runner.job_id is not None: logger.info('Runner is needed', job_of_runner=runner.job_id) return logger.error('Runner is still not doing anything') runner.kill(maybe_start_new=False, shutdown_only=False)
def _send_direct_notification_emails_1( notification_ids: t.List[int], ) -> None: notifications = p.models.db.session.query(p.models.Notification).filter( p.models.Notification.id.in_(notification_ids), p.models.Notification.email_sent_at.is_(None), ).with_for_update().all() should_send = p.models.NotificationsSetting.get_should_send_for_users( [n.receiver_id for n in notifications]) for notification in notifications: with cg_logger.bound_to_logger(notification=notification): if not should_send(notification, p.models.EmailNotificationTypes.direct): logger.info('Should not send notification') continue now = DatetimeWithTimezone.utcnow() try: p.mail.send_direct_notification_email(notification) # pylint: disable=broad-except except Exception: # pragma: no cover # This happens if mail sending fails or if the user has no # e-mail address. # TODO: make this exception more specific logger.warning( 'Could not send notification email', receiving_user_id=notification.receiver_id, exc_info=True, report_to_sentry=True, ) else: notification.email_sent_at = now p.models.db.session.commit()
def _adjust_amount_runners_1( auto_test_run_id: int, *, always_update_latest_results: bool = False) -> None: run = p.models.AutoTestRun.query.filter_by( id=auto_test_run_id).with_for_update().one_or_none() if run is None: logger.warning('Run to adjust not found') return requested_amount = run.runners_requested needed_amount = run.get_amount_needed_runners() should_notify_broker = False with cg_logger.bound_to_logger(requested_amount=requested_amount, needed_amount=needed_amount): if needed_amount == requested_amount: logger.info('No new runners needed') elif needed_amount == 0 and run.runners_requested < 2: # If we have requested more than 2 runners we should decrease this, # so do send this request to the broker. logger.info("We don't need any runners") elif needed_amount > requested_amount: logger.info('We need more runners') should_notify_broker = True else: logger.info('We have requested too many runners') should_notify_broker = True if should_notify_broker: _notify_broker_of_new_job_1(run, needed_amount) elif always_update_latest_results: _update_latest_results_in_broker_1(run.id)
def start_runner(self) -> None: assert self.instance_id is None assert self._pass is not None client = boto3.client('ec2') ec2 = boto3.resource('ec2') images = client.describe_images( Owners=['self'], Filters=[{ 'Name': 'tag:AutoTest', 'Values': [app.config['AWS_TAG_VALUE']] }], ).get('Images', []) assert images image = max(images, key=lambda image: image['CreationDate']) logger.info( 'Creating AWS instance', instance_type=app.config['AWS_INSTANCE_TYPE'], image_id=image['ImageId'] ) inst, = ec2.create_instances( ImageId=image['ImageId'], InstanceType=app.config['AWS_INSTANCE_TYPE'], MaxCount=1, MinCount=1, UserData="""#!/bin/bash F="{config_dir}/.runner-pass" echo "{password}" > "$F" chmod 777 "$F" """.format( password=self._pass, config_dir=app.config['RUNNER_CONFIG_DIR'], ) ) with bound_to_logger(instance=inst): logger.info('Started AWS instance, waiting for network') self.instance_id = inst.id for _ in range(120): try: inst = ec2.Instance(inst.id) if inst.public_ip_address: logger.info('AWS instance up') self.ipaddr = inst.public_ip_address return except ClientError as e: err = e.response.get('Error', {}) if err.get('Code', '') != 'InvalidInstanceID.NotFound': raise time.sleep(1) logger.error('Timed out waiting on AWS instance') raise TimeoutError
def match_output( cls, stdout: str, expected_output: str, step_options: t.Iterable[str]) -> t.Tuple[bool, t.Optional[int]]: """Do the output matching of an IoTest. :param stdout: The stdout of the program, so the thing we got. :param expected_output: The expected output as provided by the teacher, this might be a regex. :param step_options: A list of options as given by the students. Valid options are 'regex', 'trailing_whitespace', 'case' and 'substring'. """ regex_flags = 0 to_test = stdout.rstrip('\n') step_options = set(step_options) if 'all_whitespace' in step_options: to_test = cls._remove_whitespace(to_test) expected_output = cls._remove_whitespace(expected_output) elif 'trailing_whitespace' in step_options: to_test = '\n'.join(line.rstrip() for line in to_test.splitlines()) expected_output = '\n'.join( line.rstrip() for line in expected_output.splitlines()) if 'case' in step_options: regex_flags |= re.IGNORECASE if 'regex' not in step_options: to_test = to_test.lower() expected_output = expected_output.lower() logger.info( 'Comparing output and expected output', to_test=to_test, expected_output=expected_output, step_options=step_options, ) if 'regex' in step_options: with cg_logger.bound_to_logger( output=expected_output, to_test=to_test, flags=regex_flags, ): try: match = re.search( expected_output, to_test, flags=regex_flags, timeout=2, ) except TimeoutError: logger.warning('Regex match timed out', exc_info=True) return False, -2 logger.info('Done with regex search', match=match) return bool(match), None elif 'substring' in step_options: return expected_output in to_test, None else: return expected_output == to_test, None
def maybe_start_more_runners() -> None: """Start more runners for jobs that still need runners. This assigns all unassigned runners if possible, and if needed it also starts new runners. """ # We might change the amount of unassigned runners during this task, so # maybe we need to start more. callback_after_this_request(start_needed_unassigned_runners.delay) # Do not use count so we can lock all these runners, this makes sure we # will not create new runners while we are counting. active_jobs = { job.id: job for job in db.session.query(models.Job).filter( models.Job.state != models.JobState.finished).with_for_update() } unassigned_runners = models.Runner.get_all_active_runners().filter_by( job_id=None).with_for_update().all() jobs_needed_runners = db.session.query( models.Job.id, models.Job.wanted_runners - sql_func.count(models.Runner.id), ).filter(models.Job.id.in_(list(active_jobs.keys())), ).join( models.Runner, and_(models.Runner.job_id == models.Job.id, models.Runner.state.in_(models.RunnerState.get_active_states())), isouter=True).having( models.Job.wanted_runners > sql_func.count( models.Runner.id), ).group_by(models.Job.id).all() with bound_to_logger( jobs_needed_runners=jobs_needed_runners, all_unassigned_runners=unassigned_runners, ): if jobs_needed_runners: startable = models.Runner.get_amount_of_startable_runners() logger.info('More runners are needed') for job_id, _ in jobs_needed_runners: # This never raises a key error as only jobs in this dictionary # are selected. job = active_jobs[job_id] startable -= job.add_runners_to_job(unassigned_runners, startable) db.session.commit() elif unassigned_runners: logger.error('More runners than jobs active') else: logger.info('No new runners needed')
def _send_delayed_notification_emails( digest_type: Literal[p.models.EmailNotificationTypes.daily, p.models. EmailNotificationTypes.weekly] ) -> None: now = DatetimeWithTimezone.utcnow() if digest_type == p.models.EmailNotificationTypes.daily: max_age = now - datetime.timedelta(days=1, hours=2) else: assert digest_type == p.models.EmailNotificationTypes.weekly max_age = now - datetime.timedelta(days=7, hours=2) notifications = p.models.db.session.query(p.models.Notification).filter( p.models.Notification.email_sent_at.is_(None), p.models.Notification.created_at > max_age, ).order_by(p.models.Notification.receiver_id).with_for_update().all() should_send = p.models.NotificationsSetting.get_should_send_for_users( list(set(n.receiver_id for n in notifications)) ) notifications_to_send = [] now = DatetimeWithTimezone.utcnow() for notification in notifications: with cg_logger.bound_to_logger( notification=notification.__structlog__() ): if not should_send(notification, digest_type): logger.info('Should not send notification') continue logger.info('Should send notification') notification.email_sent_at = now notifications_to_send.append(notification) p.models.db.session.commit() for user, user_notifications in itertools.groupby( notifications_to_send, lambda n: n.receiver ): try: p.mail.send_digest_notification_email( list(user_notifications), digest_type ) # pylint: disable=broad-except except Exception: # pragma: no cover logger.warning( 'Could not send digest email', receiving_user_id=user.id, exc_info=True, report_to_sentry=True, )
def _retry_vps_action(action_name: str, func: t.Callable[[], object], max_tries: int = 50) -> None: with bound_to_logger( action=action_name, ), timed_code('_retry_vps_action'): for idx in range(max_tries): try: func() except WebFault: logger.info('Could not perform action yet', exc_info=True) time.sleep(idx) else: break else: logger.error("Couldn't perform action") raise TimeoutError
def __task() -> None: receivers = p.helpers.get_in_or_error( p.models.User, p.models.User.id, receiver_ids, same_order_as_given=True, ) sender = p.models.User.query.get(sender_id) if sender is None: # pragma: no cover raise Exception('Wanted sender was not found') failed_receivers = [] with p.mail.mail.connect() as mailer: for receiver in receivers: with cg_logger.bound_to_logger(receiver=receiver): try: p.mail.send_student_mail( mailer, sender=sender, receiver=receiver, subject=subject, text_body=body ) except: # pylint: disable=bare-except logger.info( 'Failed emailing to student', exc_info=True, report_to_sentry=True, ) failed_receivers.append(receiver) if failed_receivers: raise p.exceptions.APIException( 'Failed to email {every} user'.format( every='every' if len(receivers) != len(failed_receivers) else 'any' ), 'Failed to mail some users', p.exceptions.APICodes.MAILING_FAILED, 400, all_users=receivers, failed_users=failed_receivers, )
def maybe_kill_unneeded_runner(runner_hex_id: str) -> None: """Maybe kill the given runner if it is still unassigned. :param runner_hex_id: The hex id of the runner to maybe kill. """ # One interesting thing to consider here is how to deal with the # minimum amount of extra runners setting here. You could say that if # killing this runner brings us under this setting we shouldn't kill # the runner. However, that might mean that runners will live for very # long, and that might have unforeseen complications. So we do want to # kill runners after some time, but that might mean we kill every # extra runner at the same time, which would mean that for a minute or # so we do not have enough runners at all. To prevent that we have a # random element to when runners are killed. # TODO: If a runner goes from unassigned, to assigned, to unassigned we # might kill it too soon. As we start a job to kill it for the first time # it is unassigned, and this job might be executed while it is unassigned # for the second time. runner_id = uuid.UUID(hex=runner_hex_id) runner = db.session.query( models.Runner).filter_by(id=runner_id).with_for_update().one_or_none() with bound_to_logger(runner=runner): if runner is None: logger.warning('Runner was deleted') return should_run_date = runner.created_at + datetime.timedelta(minutes=( app.config['RUNNER_MAX_TIME_ALIVE'] + random.randint(0, 15))) if not runner.is_before_run: logger.info('Runner has already finished running', runner_state=runner.state) return elif utils.maybe_delay_current_task(should_run_date): return elif runner.job_id is not None: logger.info('Runner is needed', job_of_runner=runner.job_id) return logger.info('Runner is still not doing anything') runner.kill(maybe_start_new=True, shutdown_only=False)
def start_runner(self) -> None: assert self.instance_id is None client = boto3.client('ec2') ec2 = boto3.resource('ec2') images = client.describe_images( Owners=['self'], Filters=[{ 'Name': 'tag:AutoTest', 'Values': [app.config['AWS_TAG_VALUE']] }], ).get('Images', []) assert images image = max(images, key=lambda image: image['CreationDate']) logger.info('Creating AWS instance', instance_type=app.config['AWS_INSTANCE_TYPE'], image_id=image['ImageId']) inst, = ec2.create_instances( ImageId=image['ImageId'], InstanceType=app.config['AWS_INSTANCE_TYPE'], MaxCount=1, MinCount=1) with bound_to_logger(instance=inst): logger.info('Started AWS instance, waiting for network') self.instance_id = inst.id for _ in range(120): inst = ec2.Instance(inst.id) if inst.public_ip_address: logger.info('AWS instance up') self.ipaddr = inst.public_ip_address return time.sleep(1) logger.error('Timed out waiting on AWS instance') raise TimeoutError
def cleanup_runner(self, shutdown_only: bool) -> None: username = app.config['_TRANSIP_USERNAME'] key_file = app.config['_TRANSIP_PRIVATE_KEY_FILE'] vps_service = transip.service.VpsService(username, private_key_file=key_file) vps, = [ vps for vps in vps_service.get_vpses() if vps['ipAddress'] == self.ipaddr ] assert vps['name'] == self.instance_id with bound_to_logger(vps=vps): vps_name = vps['name'] snapshots = vps_service.get_snapshots_by_vps(vps_name) snapshot = min(snapshots, key=lambda s: s['dateTimeCreate']) self._retry_vps_action('stopping vps', lambda: vps_service.stop(vps_name)) self._retry_vps_action( 'reverting snapshot', lambda: vps_service.revert_snapshot(vps_name, snapshot['name'] ), )
def do_second_step_of_lti_launch(self) -> LTILaunchData: """Do the second step of the LTI launch. """ with bound_to_logger() as bind_to_logger: user, new_token, updated_email = self.ensure_lti_user() auth.set_current_user(user) course = self.get_course() bind_to_logger(course=course) assert course.is_lti, 'Returned course is not an LTI course' assig = self.get_assignment(user, course) bind_to_logger(assignment=assig) assert assig.course_id == course.id, ( 'The returned assignment is not in the earlier returned course' ) if assig.visibility_state.is_deleted: raise APIException( 'The launched assignment has been deleted on CodeGrade.', f'The assignment "{assig.id}" has been deleted', APICodes.OBJECT_NOT_FOUND, 404) self.set_user_role(user) new_role_created = self.set_user_course_role(user, course) models.db.session.commit() auth.AssignmentPermissions(assig).ensure_may_see() return LTILaunchData( assignment=assig, course=course, new_role_created=new_role_created, custom_lms_name=self.get_lms_name(), updated_email=updated_email, access_token=new_token, type='normal_result', )
def _execute( cls, container: 'auto_test_module.StartedContainer', opts: 'auto_test_module.ExecuteOptions', ) -> float: def now() -> str: return DatetimeWithTimezone.utcnow().isoformat() data = opts.test_instructions['data'] assert isinstance(data, dict) inputs = t.cast(t.List[dict], data['inputs']) default_result = { 'state': AutoTestStepResultState.not_started.name, 'created_at': now(), } test_result: t.Dict[str, t.Any] = { 'steps': [copy.deepcopy(default_result) for _ in inputs] } opts.update_test_result(AutoTestStepResultState.running, test_result) prog = t.cast(str, data['program']) time_limit = opts.test_instructions['command_time_limit'] total_state = AutoTestStepResultState.failed total_weight = 0 for idx, step in enumerate(inputs): test_result['steps'][idx].update({ 'state': AutoTestStepResultState.running.name, 'started_at': now(), }) opts.update_test_result(AutoTestStepResultState.running, test_result) state: t.Optional[AutoTestStepResultState] try: res = container.run_student_command( f'{prog} {step["args"]}', time_limit, stdin=step['stdin'].encode('utf-8')) except psef.auto_test.CommandTimeoutException as e: code = e.EXIT_CODE stderr = e.stderr stdout = e.stdout time_spend = e.time_spend state = AutoTestStepResultState.timed_out else: code = res.exit_code stdout = res.stdout stderr = res.stderr time_spend = res.time_spend state = None if code == 0: step_options = t.cast(t.List[str], step['options']) expected_output = step['output'].rstrip('\n') with cg_logger.bound_to_logger(step=step): success, new_code = cls.match_output( stdout, expected_output, step_options) code = code if new_code is None else new_code else: success = False achieved_points = 0 if success: total_state = AutoTestStepResultState.passed state = AutoTestStepResultState.passed total_weight += step['weight'] achieved_points = step['weight'] else: state = state or AutoTestStepResultState.failed test_result['steps'][idx].update({ 'stdout': stdout, 'stderr': stderr, 'state': state.name, 'exit_code': code, 'time_spend': time_spend, 'achieved_points': achieved_points, 'started_at': None, }) opts.update_test_result(AutoTestStepResultState.running, test_result) if state == AutoTestStepResultState.timed_out: opts.yield_core() opts.update_test_result(total_state, test_result) return total_weight