Example #1
def get_training_event_file_path(team_uuid, model_uuid):
    client = util.storage_client()
    folder = __get_model_folder(team_uuid, model_uuid)
    prefix = '%s/events.out.tfevents.' % folder
    for blob in client.list_blobs(BUCKET_BLOBS, prefix=prefix):
        return folder, __get_path(blob.name), blob.updated
    return None, None, None
Example #2
def store_event_summary_image(team_uuid, model_uuid, folder, step, tag,
                              encoded_image_string):
    blob_name = '%s/step_%d_%s' % (folder, step, tag.replace('/', '_'))
    bucket = util.storage_client().bucket(BUCKET_BLOBS)
    blob = bucket.blob(blob_name)
    if not blob.exists():
        __write_string_to_blob(blob_name, encoded_image_string, 'image/png')
Example #3
def prepare_to_upload_video(team_uuid, video_uuid, content_type):
    video_blob_name = 'video_files/%s/%s' % (team_uuid, video_uuid)
    blob = util.storage_client().bucket(BUCKET_BLOBS).blob(video_blob_name)
    expires_at_datetime = datetime.now() + timedelta(minutes=5)
    signed_url = blob.generate_signed_url(expires_at_datetime,
                                          method='PUT',
                                          content_type=content_type)
    return video_blob_name, signed_url
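The signed URL returned above can then be used by the client to upload the video bytes directly to Cloud Storage. A minimal sketch of that upload, assuming the requests library and a hypothetical local file path (neither appears in the original code):

import requests

def upload_video_with_signed_url(signed_url, content_type, local_path):
    # The Content-Type header must match the content_type that was used when
    # the signed URL was generated, otherwise Cloud Storage rejects the PUT.
    with open(local_path, 'rb') as f:
        response = requests.put(signed_url, data=f,
                                headers={'Content-Type': content_type})
    response.raise_for_status()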
Example #4
def get_image_urls(image_blob_names):
    bucket = util.storage_client().bucket(BUCKET_BLOBS)
    expires_at_datetime = datetime.now() + timedelta(minutes=10)
    signed_urls = []
    for image_blob_name in image_blob_names:
        blob = bucket.blob(image_blob_name)
        signed_urls.append(
            blob.generate_signed_url(expires_at_datetime, method='GET'))
    return signed_urls
Example #5
def perform_action_from_blob(action_parameters_blob_name, time_limit):
    blob = util.storage_client().get_bucket(BUCKET_ACTION_PARAMETERS).blob(
        action_parameters_blob_name)
    # If the blob no longer exists, this event is a duplicate and is ignored.
    if blob.exists():
        action_parameters_json = blob.download_as_string()
        blob.delete()
        action_parameters = json.loads(action_parameters_json)
        perform_action(action_parameters, time_limit)
Example #6
def set_cors_policy_for_put():
    bucket = util.storage_client().bucket(BUCKET_BLOBS)
    policies = bucket.cors
    if len(policies) == 0:
        policies.append({'origin': [constants.ORIGIN]})
        policies[0]['responseHeader'] = ['Content-Type']
        policies[0]['method'] = ['PUT']
        policies[0]['maxAgeSeconds'] = 3600
        bucket.cors = policies
        bucket.update()
Example #7
def __write_string_to_blob(blob_name, s, content_type):
    blob = util.storage_client().bucket(BUCKET_BLOBS).blob(blob_name)
    # Retry up to 5 times.
    retry = 0
    while True:
        try:
            blob.upload_from_string(s, content_type=content_type)
            return
        except:
            if retry < 5:
                retry += 1
            else:
                raise
Example #8
def trigger_action_via_blob(action_parameters):
    # Copy the given action_parameters and remove the action_time_limit and action_retriggered entries from the copy.
    action_parameters_copy = action_parameters.copy()
    action_parameters_copy.pop(ACTION_TIME_LIMIT, None)
    action_parameters_copy.pop(ACTION_RETRIGGERED, None)
    # Write the copied action_parameters to trigger the cloud function.
    action_parameters_blob_name = '%s/%s' % (
        action_parameters_copy[ACTION_NAME], str(uuid.uuid4().hex))
    action_parameters_json = json.dumps(action_parameters_copy)
    blob = util.storage_client().bucket(BUCKET_ACTION_PARAMETERS).blob(
        action_parameters_blob_name)
    util.log('action.trigger_action_via_blob - %s' %
             action_parameters_copy[ACTION_NAME])
    blob.upload_from_string(action_parameters_json, content_type="text/json")
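Writing this blob is what triggers the action: presumably a Cloud Function is subscribed to object-finalize events on BUCKET_ACTION_PARAMETERS and ends up calling perform_action_from_blob (Example #5), which reads the parameters, deletes the blob so duplicate event deliveries are ignored, and then performs the action.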
Example #9
def __validate_team_info(program, team_number, team_code):
    bucket = util.storage_client().get_bucket(BUCKET_BLOBS)
    teams = bucket.blob('team_info/teams').download_as_string().decode('utf-8')
    for line in teams.split('\n'):
        line = line.strip()
        if line == "":
            continue
        tokens = line.split(',')
        if program == tokens[0].strip():
            if team_number == tokens[1].strip():
                return team_code == tokens[2].strip()
    logging.critical(
        "__validate_team_info incorrect login program='%s' team_number='%s' team_code='%s'"
        % (program, team_number, team_code))
    return False
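Based on the parsing above, the team_info/teams blob is expected to be a plain-text file with one comma-separated program, team_number, team_code triple per line. A hypothetical sample (the values below are made up for illustration):

FTC, 12345, 123ABC
FRC, 67890, 456DEF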
Example #10
def get_trained_checkpoint_path(team_uuid, model_uuid):
    client = util.storage_client()
    # We're looking for a file like this: model.ckpt-2000.index
    prefix = '%s/model.ckpt-' % __get_model_folder(team_uuid, model_uuid)
    pattern = re.compile(r'%s(.*)\.index' % prefix)
    max_number = None
    for blob in client.list_blobs(BUCKET_BLOBS, prefix=prefix):
        match = pattern.match(blob.name)
        if match is not None:
            n = int(match.group(1))
            if max_number is None or n > max_number:
                max_number = n
    if max_number is not None:
        return __get_path('%s%d' % (prefix, max_number))
    return ''
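A small self-contained illustration of the checkpoint matching above; the folder in the prefix is a made-up example, not the real layout produced by __get_model_folder:

import re

prefix = 'teams/team-1/models/model-1/model.ckpt-'
pattern = re.compile(r'%s(.*)\.index' % prefix)
blob_names = [prefix + '1000.index', prefix + '2000.index', prefix + '2000.meta']
# Only *.index blobs match; the captured group is the checkpoint step number.
numbers = [int(m.group(1)) for m in (pattern.match(name) for name in blob_names) if m]
assert max(numbers) == 2000  # the highest-numbered checkpoint wins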
Example #11
def delete_model_blobs(team_uuid, model_uuid, action_parameters):
    client = util.storage_client()
    prefix = '%s/' % __get_model_folder(team_uuid, model_uuid)
    for blob in client.list_blobs(BUCKET_BLOBS, prefix=prefix):
        __delete_blob(blob.name)
        action.retrigger_if_necessary(action_parameters)
Example #12
def __write_blob_to_file(blob_name, filename):
    blob = util.storage_client().get_bucket(BUCKET_BLOBS).blob(blob_name)
    if blob.exists():
        blob.download_to_filename(filename)
        return True
    return False
Example #13
def __delete_blob(blob_name):
    blob = util.storage_client().get_bucket(BUCKET_BLOBS).blob(blob_name)
    if blob.exists():
        blob.delete()
        return True
    return False
Example #14
def __retrieve_blob(blob_name):
    blob = util.storage_client().get_bucket(BUCKET_BLOBS).blob(blob_name)
    return blob.download_as_string()
Example #15
def __delete_blobs(blob_names):
    # Ignore 404 errors on delete.
    bucket = util.storage_client().get_bucket(BUCKET_BLOBS)
    bucket.delete_blobs(blob_names, on_error=lambda blob: None)
Example #16
def tflite_graph_pb_exists(team_uuid, model_uuid):
    tflite_graph_pb_blob_name = '%s/tflite_graph.pb' % __get_tflite_folder(
        team_uuid, model_uuid)
    blob = util.storage_client().get_bucket(BUCKET_BLOBS).blob(
        tflite_graph_pb_blob_name)
    return blob.exists()
Example #17
def start_training_model(team_uuid, description, dataset_uuids_json,
                         starting_model, max_running_minutes,
                         num_training_steps, create_time_ms):
    # Call retrieve_model_list to update all models (which may have finished training) and update
    # the team_entity.
    model_entities = retrieve_model_list(team_uuid)

    found_starting_model = False
    for starting_model_name, starting_model_checkpoint in STARTING_MODELS.items():
        if starting_model == starting_model_name:
            found_starting_model = True
            starting_model_uuid = None
            starting_model_entity = None
            user_visible_starting_model = starting_model
            original_starting_model = starting_model
            fine_tune_checkpoint = 'gs://%s/static/training/models/%s/model.ckpt' % (
                BUCKET, starting_model_checkpoint)
            break
    if not found_starting_model:
        # starting_model is the model_uuid of one of the user's own models.
        starting_model_uuid = starting_model
        starting_model_entity = retrieve_model_entity(team_uuid,
                                                      starting_model_uuid)
        if starting_model_entity['trained_checkpoint_path'] == '':
            message = 'Error: Trained checkpoint not found for model_uuid=%s.' % starting_model_uuid
            logging.critical(message)
            raise exceptions.HttpErrorNotFound(message)
        # user_visible_starting_model is the description of that model.
        user_visible_starting_model = starting_model_entity['description']
        original_starting_model = starting_model_entity[
            'original_starting_model']
        fine_tune_checkpoint = starting_model_entity['trained_checkpoint_path']

    # storage.model_trainer_starting will raise an exception if the team doesn't have enough
    # training time left.
    model_uuid = storage.model_trainer_starting(team_uuid, max_running_minutes)
    try:
        object_detection_tar_gz = 'gs://%s/static/training/object_detection-0.1.tar.gz' % BUCKET
        slim_tar_gz = 'gs://%s/static/training/slim-0.1.tar.gz' % BUCKET
        pycocotools_tar_gz = 'gs://%s/static/training/pycocotools-2.0.tar.gz' % BUCKET

        dataset_uuid_list = json.loads(dataset_uuids_json)
        dataset_entities = storage.retrieve_dataset_entities(
            team_uuid, dataset_uuid_list)
        if len(dataset_entities) != len(dataset_uuid_list):
            message = 'Error: One or more datasets not found for dataset_uuids=%s.' % dataset_uuids_json
            logging.critical(message)
            raise exceptions.HttpErrorNotFound(message)

        previous_training_steps = 0
        dataset_uuids = []
        train_input_path = []
        eval_input_path = []
        train_frame_count = 0
        eval_frame_count = 0
        train_negative_frame_count = 0
        eval_negative_frame_count = 0
        train_dict_label_to_count = {}
        eval_dict_label_to_count = {}
        sorted_label_list = None
        label_map_path = None
        if starting_model_entity is not None:
            previous_training_steps += starting_model_entity[
                'previous_training_steps']
            dataset_uuids.extend(starting_model_entity['dataset_uuids'])
            train_input_path.extend(starting_model_entity['train_input_path'])
            eval_input_path.extend(starting_model_entity['eval_input_path'])
            train_frame_count += starting_model_entity['train_frame_count']
            eval_frame_count += starting_model_entity['eval_frame_count']
            train_negative_frame_count += starting_model_entity[
                'train_negative_frame_count']
            eval_negative_frame_count += starting_model_entity[
                'eval_negative_frame_count']
            util.extend_dict_label_to_count(
                train_dict_label_to_count,
                starting_model_entity['train_dict_label_to_count'])
            util.extend_dict_label_to_count(
                eval_dict_label_to_count,
                starting_model_entity['eval_dict_label_to_count'])
            sorted_label_list = starting_model_entity['sorted_label_list']
            label_map_path = starting_model_entity['label_map_path']

        for dataset_entity in dataset_entities:
            dataset_uuids.append(dataset_entity['dataset_uuid'])
            train_input_path.append(dataset_entity['train_input_path'])
            eval_input_path.append(dataset_entity['eval_input_path'])
            train_frame_count += dataset_entity['train_frame_count']
            eval_frame_count += dataset_entity['eval_frame_count']
            train_negative_frame_count += dataset_entity[
                'train_negative_frame_count']
            eval_negative_frame_count += dataset_entity[
                'eval_negative_frame_count']
            util.extend_dict_label_to_count(
                train_dict_label_to_count,
                dataset_entity['train_dict_label_to_count'])
            util.extend_dict_label_to_count(
                eval_dict_label_to_count,
                dataset_entity['eval_dict_label_to_count'])
            if sorted_label_list is None:
                sorted_label_list = dataset_entity['sorted_label_list']
                label_map_path = dataset_entity['label_map_path']
            elif sorted_label_list != dataset_entity['sorted_label_list']:
                message = "Error: The datasets contain different labels and cannot be used together."
                logging.critical(message)
                raise exceptions.HttpErrorBadRequest(message)

        # Create the pipeline.config file and store it in cloud storage.
        bucket = util.storage_client().get_bucket(BUCKET)
        config_template_blob_name = 'static/training/models/configs/%s.config' % original_starting_model
        quantization_delay = max(0, num_training_steps - 200)
        pipeline_config = bucket.blob(
            config_template_blob_name).download_as_string().decode('utf-8')
        pipeline_config = (pipeline_config
            .replace('TO_BE_CONFIGURED/num_classes', str(len(sorted_label_list)))
            .replace('TO_BE_CONFIGURED/fine_tune_checkpoint', fine_tune_checkpoint)
            .replace('TO_BE_CONFIGURED/train_input_path', json.dumps(train_input_path))
            .replace('TO_BE_CONFIGURED/label_map_path', label_map_path)
            .replace('TO_BE_CONFIGURED/eval_input_path', json.dumps(eval_input_path))
            .replace('TO_BE_CONFIGURED/num_examples', str(eval_frame_count))
            .replace('TO_BE_CONFIGURED/num_training_steps', str(num_training_steps))
            .replace('TO_BE_CONFIGURED/quantization_delay', str(quantization_delay)))
        pipeline_config_path = blob_storage.store_pipeline_config(
            team_uuid, model_uuid, pipeline_config)

        model_dir = blob_storage.get_model_folder_path(team_uuid, model_uuid)
        job_dir = model_dir
        checkpoint_dir = model_dir

        ml = __get_ml_service()
        parent = __get_parent()
        train_job_id = __get_train_job_id(model_uuid)
        scheduling = {
            'maxRunningTime': '%ds' % (max_running_minutes * 60),
        }
        train_training_input = {
            'scaleTier': 'BASIC_TPU',
            'packageUris': [
                object_detection_tar_gz,
                slim_tar_gz,
                pycocotools_tar_gz,
            ],
            'pythonModule': 'object_detection.model_tpu_main',
            'args': [
                '--model_dir',
                model_dir,
                '--pipeline_config_path',
                pipeline_config_path,
                '--num_train_steps',
                str(num_training_steps),

                # Note(lizlooney) I commented out the tpu_zone argument after jobs were failing on
                # July 10, 2020. I found documentation at
                # https://cloud.google.com/ai-platform/training/docs/using-tpus#connecting_to_the_tpu_grpc_server
                # that says "However, you must make one important change when you use
                # TPUClusterResolver for code that runs on AI Platform Training: Do not provide any
                # arguments when you construct the TPUClusterResolver instance. When the tpu, zone,
                # and project keyword arguments are all set to their default value of None, AI
                # Platform Training automatically provides the cluster resolver with the necessary
                # connection details through environment variables."
                #'--tpu_zone', 'us-central1',
            ],
            # TODO(lizlooney): Specify hyperparameters.
            #'hyperparameters': {
            #  object (HyperparameterSpec)
            #},
            'region': 'us-central1',  # Don't hardcode?
            'jobDir': job_dir,
            'runtimeVersion': '1.15',
            'pythonVersion': '3.7',
            'scheduling': scheduling,
        }
        train_job = {
            'jobId': train_job_id,
            'trainingInput': train_training_input,
        }
        train_job_response = ml.projects().jobs().create(
            parent=parent, body=train_job).execute()
    except:
        util.log(
            'model_trainer.start_training_model - creating training job - except %s'
            % traceback.format_exc().replace('\n', ' ... '))
        # storage.model_trainer_failed_to_start will adjust the team's remaining training time.
        storage.model_trainer_failed_to_start(team_uuid, model_uuid,
                                              max_running_minutes)
        raise

    try:
        if eval_frame_count > 0:
            eval_job_id = __get_eval_job_id(model_uuid)
            eval_training_input = {
                'scaleTier': 'BASIC_GPU',
                'packageUris': [
                    object_detection_tar_gz,
                    slim_tar_gz,
                    pycocotools_tar_gz,
                ],
                'pythonModule': 'object_detection.model_main',
                'args': [
                    '--model_dir',
                    model_dir,
                    '--pipeline_config_path',
                    pipeline_config_path,
                    '--checkpoint_dir',
                    checkpoint_dir,
                ],
                'region': 'us-central1',
                'jobDir': job_dir,
                'runtimeVersion': '1.15',
                'pythonVersion': '3.7',
            }
            eval_job = {
                'jobId': eval_job_id,
                'trainingInput': eval_training_input,
            }
            eval_job_response = ml.projects().jobs().create(
                parent=parent, body=eval_job).execute()
        else:
            eval_job_response = None
    except:
        util.log(
            'model_trainer.start_training_model - creating eval job - except %s'
            % traceback.format_exc().replace('\n', ' ... '))
        # storage.model_trainer_failed_to_start will adjust the team's remaining training time.
        storage.model_trainer_failed_to_start(team_uuid, model_uuid,
                                              max_running_minutes)
        # Cancel the training job.
        ml.projects().jobs().cancel(
            name=__get_train_job_name(model_uuid)).execute()
        raise
    model_entity = storage.model_trainer_started(
        team_uuid, model_uuid, description, dataset_uuids, create_time_ms,
        max_running_minutes, num_training_steps, previous_training_steps,
        starting_model, user_visible_starting_model, original_starting_model,
        fine_tune_checkpoint, sorted_label_list, label_map_path,
        train_input_path, eval_input_path, train_frame_count, eval_frame_count,
        train_negative_frame_count, eval_negative_frame_count,
        train_dict_label_to_count, eval_dict_label_to_count,
        train_job_response, eval_job_response)
    return model_entity
Example #18
def __get_download_url(blob_name):
    blob = util.storage_client().bucket(BUCKET_BLOBS).blob(blob_name)
    if not blob.exists():
        return False, ''
    expires_at_datetime = datetime.now() + timedelta(minutes=10)
    return True, blob.generate_signed_url(expires_at_datetime, method='GET')
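As with the upload URL in Example #3, the signed GET URL returned here can be fetched by any HTTP client before it expires. A minimal sketch, again assuming the requests library (not part of the original code):

import requests

def download_blob_via_signed_url(signed_url):
    response = requests.get(signed_url)
    response.raise_for_status()
    return response.content  # the raw blob bytes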