Exemplo n.º 1
0
 def confirm_evaluation(self, job) -> bool:
     """Confirm with the Botleague liaison that ``job`` should be evaluated.

     The network round trip is skipped when running in test or when the job
     is already marked confirmed. Otherwise the job's eval key is POSTed to
     the liaison's ``/confirm`` endpoint. The resulting status and
     confirmation flag are written onto ``job`` and persisted through
     ``self.save_job``.

     Returns:
         True when the job is (or already was) confirmed; False when the
         liaison answered with a 4xx denial or any other non-OK response.
     """
     # Every outcome except an explicit 4xx denial leaves the job "created".
     status = JOB_STATUS_CREATED
     if in_test():
         ret = True
     elif dbox(job).confirmed:
         log.info(f'Job already confirmed {box2json(job)}')
         ret = True
     else:
         url = f'{job.botleague_liaison_host}/confirm'
         payload = {'eval_key': job.eval_spec.eval_key}
         log.info(f'Confirming eval {payload} at {url}...')
         confirmation = requests.post(url, json=payload)
         denied = 400 <= confirmation.status_code < 500
         if denied:
             status = JOB_STATUS_DENIED_CONFIRMATION
             log.error('Botleague denied confirmation of job, skipping')
         elif not confirmation.ok:
             log.error('Unable to confirm job with botleague liaison, '
                       'will try again shortly')
         else:
             log.success(f'Confirmed eval job {box2json(job)} at {url}')
         # Only a 2xx/3xx response counts as a confirmation.
         ret = not denied and bool(confirmation.ok)
     job.status = status
     job.confirmed = ret
     self.save_job(job)
     return ret
Exemplo n.º 2
0
def add_stackdriver_sink(loguru_logger, log_name):
    """Attach a sink to ``loguru_logger`` that forwards records to Stackdriver.

    Google cloud log sink in "Global" i.e.
    https://console.cloud.google.com/logs/viewer?project=silken-impulse-217423&minLogLevel=0&expandAll=false&resource=global

    The module-level ``stackdriver_client`` is created on first use and
    reused by later calls; each call gets its own Stackdriver logger named
    ``log_name``. When running in test, or when
    ``blconfig.disable_cloud_log_sinks`` is set, no client or logger is
    created and the sink silently drops every record.

    Args:
        loguru_logger: Logger object exposing ``add(sink)``.
        log_name: Name passed to ``stackdriver_client.logger``.
    """
    global stackdriver_client
    # Always bound so the sink closure can never hit an unbound variable,
    # even when the client already exists or cloud sinks are disabled.
    stackdriver_logger = None
    if not in_test() and not blconfig.disable_cloud_log_sinks:
        if stackdriver_client is None:
            stackdriver_client = gcloud_logging.Client()
        stackdriver_logger = stackdriver_client.logger(log_name)

    def sink(message):
        # Nothing to forward to when cloud logging is off or we are in test.
        if stackdriver_logger is None or in_test():
            return
        record = message.record
        level = str(record['level'])
        if level == 'SUCCESS':
            severity = 'NOTICE'
        elif level == 'TRACE':
            # Nothing lower than DEBUG in stackdriver
            severity = 'DEBUG'
        elif level == 'EXCEPTION':
            severity = 'ERROR'
        elif level in VALID_STACK_DRIVER_LEVELS:
            severity = level
        else:
            severity = 'INFO'
        stackdriver_logger.log_text(message, severity=severity)

    loguru_logger.add(sink)
Exemplo n.º 3
0
 def start_instance(self, inst):
     """Ask GCE to start ``inst`` and remember the pending operation.

     Does nothing except log a warning when running in test. Otherwise the
     operation returned by the start request is appended to
     ``self.gce_ops_in_progress``.
     """
     if in_test():
         log.warning('Not starting instance in test')
         return
     start_request = self.gce.instances().start(project=self.project,
                                                zone=self.zone,
                                                instance=inst.name)
     op = start_request.execute()
     self.gce_ops_in_progress.append(op)
Exemplo n.º 4
0
def add_slack_error_sink(loguru_logger, channel: str, log_name: str = ''):
    """Attach a sink to ``loguru_logger`` that posts error records to Slack.

    No sink is added when running in test or when
    ``blconfig.disable_cloud_log_sinks`` is set, unless ``TEST_ALERTS`` is
    present in the environment.

    Records at level ERROR, CRITICAL, ALERT or EMERGENCY are posted to
    ``channel``. Records are de-duplicated per process by the MD5 of their
    message text: a text that was already seen is posted again only when
    more than five minutes have passed since its last notification.
    Messages longer than 1000 characters are uploaded in full and only a
    truncated excerpt plus the upload URL is posted.

    Args:
        loguru_logger: Logger object exposing ``add(sink)`` and ``info``.
        channel: Slack channel the alerts are posted to.
        log_name: Optional heading prepended (in bold) to every alert.
    """
    alerts_forced = 'TEST_ALERTS' in os.environ
    if not alerts_forced and (in_test() or
                              blconfig.disable_cloud_log_sinks):
        loguru_logger.info('Not adding slack notifier')
        return

    client = slack.WebClient(token=decrypt_db_key('SLACK_ERROR_BOT_TOKEN'))

    # Per-process bookkeeping keyed by the hash of the record's message text.
    msg_hashes = defaultdict(SlackMsgHash)
    alert_levels = ('ERROR', 'CRITICAL', 'ALERT', 'EMERGENCY')
    renotify_after_seconds = 60 * 5

    def sink(message):
        import hashlib
        level = str(message.record['level'])
        if level not in alert_levels:
            return

        def excerpt_with_link(full_text):
            # Store the full text remotely and return a Slack-sized excerpt.
            stamp = message.record['time'].isoformat().replace(':', '')
            suffix = utils.generate_rand_alphanumeric(10)
            log_url = upload.upload_str(name=f'{stamp}_{suffix}.txt',
                                        content=full_text,
                                        bucket_name='deepdrive-alert-logs')
            return (f'```{full_text[:500]}\n...\n{full_text[-500:]}```'
                    f'\nFull message: {log_url}')

        def notify(digest):
            # Work on a copy so the record handed to the sink is untouched.
            body = copy(message)
            if len(body) > 1000:
                body = excerpt_with_link(body)
            else:
                body = f'```{body}```'
            slack_text = (f'{body}\n'
                          f'Message duplicates in this process '
                          f'{msg_hashes[digest].count}')
            if log_name:
                slack_text = f'*{log_name}*\n{slack_text}'
            client.chat_postMessage(channel=channel, text=slack_text)
            msg_hashes[digest].last_notified = time.time()

        text = message.record['message']
        msg_hash = hashlib.md5(text.encode()).hexdigest()
        if msg_hash not in msg_hashes:
            notify(msg_hash)
        elif (time.time() - msg_hashes[msg_hash].last_notified
              > renotify_after_seconds):
            notify(msg_hash)

        msg_hashes[msg_hash].count += 1

    loguru_logger.add(sink)
Exemplo n.º 5
0
 def create_instance(self, current_instances):
     """Request a new GCE instance built from the stored instance config.

     The instance config JSON is loaded from ``INSTANCE_CONFIG_PATH`` under
     ``ROOT``; its name and the first disk's device name are both set to
     the name returned by ``self.get_next_instance_name``.

     Returns:
         The insert operation wrapped in a ``Box``, or None when running in
         test (no instance is created in that case).
     """
     if in_test():
         log.warning('Not creating instance in test')
         return None
     instance_name = self.get_next_instance_name(current_instances)
     config = Box.from_json(
         filename=os.path.join(ROOT, INSTANCE_CONFIG_PATH))
     # TODO: If job is CI, no GPU needed, but maybe more CPU
     config.name = instance_name
     config.disks[0].deviceName = instance_name
     return Box(
         self.gce.instances().insert(project=self.project,
                                     zone=self.zone,
                                     body=config.to_dict()).execute())
Exemplo n.º 6
0
 def sink(message):
     """Forward one log message to Stackdriver with a mapped severity.

     SUCCESS, TRACE and EXCEPTION are renamed to NOTICE, DEBUG and ERROR;
     levels listed in ``VALID_STACK_DRIVER_LEVELS`` pass through unchanged;
     anything else is sent as INFO. Nothing is sent when running in test.
     """
     # Levels that are sent to Stackdriver under a different severity name
     # (nothing lower than DEBUG exists there, so TRACE becomes DEBUG).
     renamed = {
         'SUCCESS': 'NOTICE',
         'TRACE': 'DEBUG',
         'EXCEPTION': 'ERROR',
     }
     level = str(message.record['level'])
     if level in renamed:
         severity = renamed[level]
     elif level in VALID_STACK_DRIVER_LEVELS:
         severity = level
     else:
         severity = 'INFO'
     if in_test():
         return
     stackdriver_logger.log_text(message, severity=severity)
Exemplo n.º 7
0
def fetch_instance_id() -> Tuple[str, bool]:
    """Return ``(instance_id, is_real)`` for the machine running this process.

    When running in test, or when ``INSTANCE_ID`` is set in the environment,
    the id is read from that variable and ``is_real`` is False. Otherwise
    the id is fetched from ``METADATA_URL`` and ``is_real`` is True.

    Raises:
        SystemExit: with status 1 when the metadata request fails for any
            reason (including the request timing out).
    """
    import sys

    if in_test() or 'INSTANCE_ID' in os.environ:
        # NOTE(review): in test without INSTANCE_ID exported this lookup
        # raises KeyError - confirm the test setup always sets it.
        ret = os.environ['INSTANCE_ID']
        is_real = False
    else:
        try:
            # Bounded wait so a machine that is not on GCP reaches the
            # error message below instead of blocking indefinitely.
            ret = requests.get(f'{METADATA_URL}/id',
                               headers={
                                   'Metadata-Flavor': 'Google'
                               },
                               timeout=10).text
            log.success('INSTANCE_ID: ' + ret)
        except Exception:
            log.error('Unable to get GCP instance metadata. '
                      'Are you on GCP? If not, you can manually'
                      ' set the INSTANCE_ID'
                      ' in your env for testing purposes.')
            # sys.exit rather than the site-provided ``exit`` helper, which
            # is meant for the interactive shell and may not be defined.
            sys.exit(1)
        is_real = True
    return ret, is_real
Exemplo n.º 8
0
    def stop_old_containers_if_running(self):
        """Stop running containers whose image is a known Deepdrive image.

        Only the first repo tag of each container's image is inspected.
        Nothing is stopped when running in test.
        """
        tag_prefixes = ('deepdriveio/deepdrive:problem_',
                        'deepdriveio/deepdrive:bot_')
        exact_tags = ('deepdriveio/private:deepdrive-sim-package',
                      'deepdriveio/ue4-deepdrive-deps:latest')

        def is_botleague(_container):
            tags = _container.image.attrs['RepoTags']
            if not tags:
                return False
            image_name = tags[0]
            return image_name.startswith(tag_prefixes) or \
                image_name in exact_tags

        for container in self.docker.containers.list():
            if container.status != 'running':
                continue
            if not is_botleague(container):
                continue
            if in_test():
                continue
            container.stop()
    def exit(self):
        """Release the semaphore and terminate the process with a status code.

        The status reflects why the process is exiting: 100 for a caught
        exception, 101 for SIGINT, 102 for SIGTERM and 1 for anything else.
        When running in test the semaphore is still released but the
        process is not terminated.
        """
        self.release_semaphore()
        if in_test():
            log.info('Not exiting in test')
            return

        if self.caught_exception:
            report, reason, status = \
                log.error, 'Exiting due to caught exception', 100
        elif self.caught_sigint:
            report, reason, status = \
                log.warning, 'Exiting due to caught sigint', 101
        elif self.caught_sigterm:
            report, reason, status = \
                log.warning, 'Exiting due to caught sigterm', 102
        else:
            report, reason, status = \
                log.error, 'Unexpected reason for exit', 1
        report(reason)

        log.warning(f'Exiting with status {status}')

        # http://tldp.org/LDP/abs/html/exitcodes.html
        sys.exit(status)
Exemplo n.º 10
0
 def send_results(job):
     """POST the results of ``job`` back to the Botleague liaison.

     Does nothing when running in test. Any exception raised while sending
     is logged and swallowed, so this function never raises for a failed
     delivery; a non-OK response is logged as an error.
     """
     if in_test():
         return
     try:
         log.info(f'Sending results for job \n{box2json(job)}')
         # Nested custom retry to deal with 409's
         results_resp = post_results_with_retries(
             url=f'{job.botleague_liaison_host}/results',
             json={'eval_key': job.eval_spec.eval_key,
                   'results': job.results})
         if results_resp.ok:
             json_resp = results_resp.json()
             pretty_resp = json.dumps(json_resp, indent=2)
             log.success(
                 f'Successfully posted to botleague! response:\n'
                 f'{pretty_resp}')
         else:
             log.error(f'Error posting results back to botleague: '
                       f'{results_resp}')
     except Exception:
         # TODO: Create an alert on this log message
         log.exception('Possible problem sending results back to '
                       'liaison.')
Exemplo n.º 11
0
 def should_mock_github(self):
     """Tell whether GitHub access should be mocked.

     Truthy when ``SHOULD_MOCK_GITHUB`` is present in the environment, when
     running in test, or when ``self.dry_run`` is set.
     """
     if 'SHOULD_MOCK_GITHUB' in os.environ:
         return True
     return in_test() or self.dry_run
Exemplo n.º 12
0
 def should_mock_gcs(self):
     """Tell whether Google Cloud Storage access should be mocked.

     Truthy when ``SHOULD_MOCK_GCS`` is present in the environment, when
     running in test, or when ``self.dry_run`` is set.
     """
     if 'SHOULD_MOCK_GCS' in os.environ:
         return True
     return in_test() or self.dry_run
Exemplo n.º 13
0
 def min_search_date(self):
     """Return the earliest search timestamp as an ISO-8601 UTC string.

     A different (earlier) fixed timestamp is used when running in test.
     """
     if in_test():
         return '2019-05-07T19:47:27Z'
     return '2019-08-15T01:42:05Z'
Exemplo n.º 14
0
 def sleep_one_second(self):
     """Sleep for one second unless in test or ``self.kill_now`` is set."""
     if not in_test() and not self.kill_now:
         time.sleep(1)
Exemplo n.º 15
0
def ping_cronitor(state):
    """Send a ping for ``state`` to the Cronitor monitor; no-op in test.

    The request waits at most ten seconds.
    """
    if in_test():
        return
    log.trace(f'Pinging cronitor with {state}')
    ping_url = 'https://cronitor.link/MJ8I4x/%s' % state
    requests.get(ping_url, timeout=10)