def get_training_event_file_path(team_uuid, model_uuid):
    client = util.storage_client()
    folder = __get_model_folder(team_uuid, model_uuid)
    prefix = '%s/events.out.tfevents.' % folder
    for blob in client.list_blobs(BUCKET_BLOBS, prefix=prefix):
        return folder, __get_path(blob.name), blob.updated
    return None, None, None

def store_event_summary_image(team_uuid, model_uuid, folder, step, tag, encoded_image_string):
    blob_name = '%s/step_%d_%s' % (folder, step, tag.replace('/', '_'))
    bucket = util.storage_client().bucket(BUCKET_BLOBS)
    blob = bucket.blob(blob_name)
    if not blob.exists():
        __write_string_to_blob(blob_name, encoded_image_string, 'image/png')

def prepare_to_upload_video(team_uuid, video_uuid, content_type):
    video_blob_name = 'video_files/%s/%s' % (team_uuid, video_uuid)
    blob = util.storage_client().bucket(BUCKET_BLOBS).blob(video_blob_name)
    expires_at_datetime = datetime.now() + timedelta(minutes=5)
    signed_url = blob.generate_signed_url(
        expires_at_datetime, method='PUT', content_type=content_type)
    return video_blob_name, signed_url

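# A minimal sketch (assuming the 'requests' library; file name and content
# type are invented for illustration) of how a client would use the signed
# URL returned by prepare_to_upload_video. The Content-Type header on the PUT
# must match the content_type that was passed to generate_signed_url, or
# Cloud Storage rejects the upload.
#
#     import requests
#     video_blob_name, signed_url = prepare_to_upload_video(
#         team_uuid, video_uuid, 'video/mp4')
#     with open('my_video.mp4', 'rb') as f:
#         requests.put(signed_url, data=f, headers={'Content-Type': 'video/mp4'})
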
def get_image_urls(image_blob_names):
    bucket = util.storage_client().bucket(BUCKET_BLOBS)
    expires_at_datetime = datetime.now() + timedelta(minutes=10)
    signed_urls = []
    for image_blob_name in image_blob_names:
        blob = bucket.blob(image_blob_name)
        signed_urls.append(
            blob.generate_signed_url(expires_at_datetime, method='GET'))
    return signed_urls

def perform_action_from_blob(action_parameters_blob_name, time_limit):
    blob = util.storage_client().get_bucket(BUCKET_ACTION_PARAMETERS).blob(
        action_parameters_blob_name)
    # If the blob no longer exists, this event is a duplicate and is ignored.
    if blob.exists():
        action_parameters_json = blob.download_as_string()
        blob.delete()
        action_parameters = json.loads(action_parameters_json)
        perform_action(action_parameters, time_limit)

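# A minimal sketch (hypothetical, not part of this module) of the Cloud
# Function entry point that would call perform_action_from_blob. The 'bucket'
# and 'name' fields follow the Cloud Storage finalize event payload;
# time_limit stands in for whatever deadline value perform_action expects.
# Because perform_action_from_blob deletes the blob before acting, a
# duplicate delivery of the same event becomes a no-op.
#
#     def on_action_blob_finalized(event, context):
#         if event['bucket'] == BUCKET_ACTION_PARAMETERS:
#             perform_action_from_blob(event['name'], time_limit)
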
def set_cors_policy_for_put():
    bucket = util.storage_client().bucket(BUCKET_BLOBS)
    policies = bucket.cors
    if len(policies) == 0:
        policies.append({'origin': [constants.ORIGIN]})
    policies[0]['responseHeader'] = ['Content-Type']
    policies[0]['method'] = ['PUT']
    policies[0]['maxAgeSeconds'] = 3600
    bucket.cors = policies
    bucket.update()

def __write_string_to_blob(blob_name, s, content_type):
    blob = util.storage_client().bucket(BUCKET_BLOBS).blob(blob_name)
    # Retry up to 5 times.
    retry = 0
    while True:
        try:
            blob.upload_from_string(s, content_type=content_type)
            return
        except:
            if retry < 5:
                retry += 1
            else:
                raise

def trigger_action_via_blob(action_parameters):
    # Copy the given action_parameters and remove the action_time_limit and
    # action_retriggered entries from the copy.
    action_parameters_copy = action_parameters.copy()
    action_parameters_copy.pop(ACTION_TIME_LIMIT, None)
    action_parameters_copy.pop(ACTION_RETRIGGERED, None)
    # Write the copied action_parameters to trigger the cloud function.
    action_parameters_blob_name = '%s/%s' % (
        action_parameters_copy[ACTION_NAME], str(uuid.uuid4().hex))
    action_parameters_json = json.dumps(action_parameters_copy)
    blob = util.storage_client().bucket(BUCKET_ACTION_PARAMETERS).blob(
        action_parameters_blob_name)
    util.log('action.trigger_action_via_blob - %s' %
             action_parameters_copy[ACTION_NAME])
    blob.upload_from_string(action_parameters_json, content_type="text/json")

def __validate_team_info(program, team_number, team_code):
    bucket = util.storage_client().get_bucket(BUCKET_BLOBS)
    teams = bucket.blob('team_info/teams').download_as_string().decode('utf-8')
    for line in teams.split('\n'):
        line = line.strip()
        if line == "":
            continue
        tokens = line.split(',')
        if program == tokens[0].strip():
            if team_number == tokens[1].strip():
                return team_code == tokens[2].strip()
    logging.critical(
        "__validate_team_info incorrect login program='%s' team_number='%s' team_code='%s'" %
        (program, team_number, team_code))
    return False

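# For reference, __validate_team_info expects the team_info/teams blob to be
# a CSV-like text file with one team per line: program, team number, team
# code. A hypothetical example (values invented for illustration):
#
#     FTC, 12345, 0123456789abcdef
#     FRC, 67890, fedcba9876543210
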
def get_trained_checkpoint_path(team_uuid, model_uuid):
    client = util.storage_client()
    # We're looking for a file like this: model.ckpt-2000.index
    prefix = '%s/model.ckpt-' % __get_model_folder(team_uuid, model_uuid)
    pattern = re.compile(r'%s(.*)\.index' % prefix)
    max_number = None
    for blob in client.list_blobs(BUCKET_BLOBS, prefix=prefix):
        match = pattern.match(blob.name)
        if match is not None:
            n = int(match.group(1))
            if max_number is None or n > max_number:
                max_number = n
    if max_number is not None:
        return __get_path('%s%d' % (prefix, max_number))
    return ''

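# For example, if the model folder contains model.ckpt-1000.index and
# model.ckpt-2000.index, the pattern captures '1000' and '2000', max_number
# ends up as 2000, and the function returns the path for model.ckpt-2000,
# the highest-numbered (most recent) checkpoint.
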
def delete_model_blobs(team_uuid, model_uuid, action_parameters):
    client = util.storage_client()
    prefix = '%s/' % __get_model_folder(team_uuid, model_uuid)
    for blob in client.list_blobs(BUCKET_BLOBS, prefix=prefix):
        __delete_blob(blob.name)
        action.retrigger_if_necessary(action_parameters)

def __write_blob_to_file(blob_name, filename):
    blob = util.storage_client().get_bucket(BUCKET_BLOBS).blob(blob_name)
    if blob.exists():
        blob.download_to_filename(filename)
        return True
    return False

def __delete_blob(blob_name):
    blob = util.storage_client().get_bucket(BUCKET_BLOBS).blob(blob_name)
    if blob.exists():
        blob.delete()
        return True
    return False

def __retrieve_blob(blob_name):
    blob = util.storage_client().get_bucket(BUCKET_BLOBS).blob(blob_name)
    return blob.download_as_string()

def __delete_blobs(blob_names):
    # Ignore 404 errors on delete.
    bucket = util.storage_client().get_bucket(BUCKET_BLOBS)
    bucket.delete_blobs(blob_names, on_error=lambda blob: None)

def tflite_graph_pb_exists(team_uuid, model_uuid):
    tflite_graph_pb_blob_name = '%s/tflite_graph.pb' % __get_tflite_folder(
        team_uuid, model_uuid)
    blob = util.storage_client().get_bucket(BUCKET_BLOBS).blob(
        tflite_graph_pb_blob_name)
    return blob.exists()

def start_training_model(team_uuid, description, dataset_uuids_json,
                         starting_model, max_running_minutes,
                         num_training_steps, create_time_ms):
    # Call retrieve_model_list to update all models (which may have finished
    # training) and update the team_entity.
    model_entities = retrieve_model_list(team_uuid)

    found_starting_model = False
    for starting_model_name, starting_model_checkpoint in STARTING_MODELS.items():
        if starting_model == starting_model_name:
            found_starting_model = True
            starting_model_uuid = None
            starting_model_entity = None
            user_visible_starting_model = starting_model
            original_starting_model = starting_model
            fine_tune_checkpoint = 'gs://%s/static/training/models/%s/model.ckpt' % (
                BUCKET, starting_model_checkpoint)
            break
    if not found_starting_model:
        # starting_model is the model_uuid of one of the user's own models.
        starting_model_uuid = starting_model
        starting_model_entity = retrieve_model_entity(team_uuid, starting_model_uuid)
        if starting_model_entity['trained_checkpoint_path'] == '':
            message = 'Error: Trained checkpoint not found for model_uuid=%s.' % starting_model_uuid
            logging.critical(message)
            raise exceptions.HttpErrorNotFound(message)
        # user_visible_starting_model is the description of that model.
        user_visible_starting_model = starting_model_entity['description']
        original_starting_model = starting_model_entity['original_starting_model']
        fine_tune_checkpoint = starting_model_entity['trained_checkpoint_path']

    # storage.model_trainer_starting will raise an exception if the team
    # doesn't have enough training time left.
    model_uuid = storage.model_trainer_starting(team_uuid, max_running_minutes)
    try:
        object_detection_tar_gz = 'gs://%s/static/training/object_detection-0.1.tar.gz' % BUCKET
        slim_tar_gz = 'gs://%s/static/training/slim-0.1.tar.gz' % BUCKET
        pycocotools_tar_gz = 'gs://%s/static/training/pycocotools-2.0.tar.gz' % BUCKET

        dataset_uuid_list = json.loads(dataset_uuids_json)
        dataset_entities = storage.retrieve_dataset_entities(
            team_uuid, dataset_uuid_list)
        if len(dataset_entities) != len(dataset_uuid_list):
            message = 'Error: One or more datasets not found for dataset_uuids=%s.' % dataset_uuids_json
            logging.critical(message)
            raise exceptions.HttpErrorNotFound(message)

        previous_training_steps = 0
        dataset_uuids = []
        train_input_path = []
        eval_input_path = []
        train_frame_count = 0
        eval_frame_count = 0
        train_negative_frame_count = 0
        eval_negative_frame_count = 0
        train_dict_label_to_count = {}
        eval_dict_label_to_count = {}
        sorted_label_list = None
        label_map_path = None
        if starting_model_entity is not None:
            previous_training_steps += starting_model_entity['previous_training_steps']
            dataset_uuids.extend(starting_model_entity['dataset_uuids'])
            train_input_path.extend(starting_model_entity['train_input_path'])
            eval_input_path.extend(starting_model_entity['eval_input_path'])
            train_frame_count += starting_model_entity['train_frame_count']
            eval_frame_count += starting_model_entity['eval_frame_count']
            train_negative_frame_count += starting_model_entity['train_negative_frame_count']
            eval_negative_frame_count += starting_model_entity['eval_negative_frame_count']
            util.extend_dict_label_to_count(
                train_dict_label_to_count,
                starting_model_entity['train_dict_label_to_count'])
            util.extend_dict_label_to_count(
                eval_dict_label_to_count,
                starting_model_entity['eval_dict_label_to_count'])
            sorted_label_list = starting_model_entity['sorted_label_list']
            label_map_path = starting_model_entity['label_map_path']

        for dataset_entity in dataset_entities:
            dataset_uuids.append(dataset_entity['dataset_uuid'])
            train_input_path.append(dataset_entity['train_input_path'])
            eval_input_path.append(dataset_entity['eval_input_path'])
            train_frame_count += dataset_entity['train_frame_count']
            eval_frame_count += dataset_entity['eval_frame_count']
            train_negative_frame_count += dataset_entity['train_negative_frame_count']
            eval_negative_frame_count += dataset_entity['eval_negative_frame_count']
            util.extend_dict_label_to_count(
                train_dict_label_to_count,
                dataset_entity['train_dict_label_to_count'])
            util.extend_dict_label_to_count(
                eval_dict_label_to_count,
                dataset_entity['eval_dict_label_to_count'])
            if sorted_label_list is None:
                sorted_label_list = dataset_entity['sorted_label_list']
                label_map_path = dataset_entity['label_map_path']
            elif sorted_label_list != dataset_entity['sorted_label_list']:
                message = "Error: The datasets contain different labels and cannot be used together."
                logging.critical(message)
                raise exceptions.HttpErrorBadRequest(message)

        # Create the pipeline.config file and store it in cloud storage.
        bucket = util.storage_client().get_bucket(BUCKET)
        config_template_blob_name = 'static/training/models/configs/%s.config' % original_starting_model
        quantization_delay = max(0, num_training_steps - 200)
        pipeline_config = (bucket.blob(config_template_blob_name)
            .download_as_string().decode('utf-8')
            .replace('TO_BE_CONFIGURED/num_classes', str(len(sorted_label_list)))
            .replace('TO_BE_CONFIGURED/fine_tune_checkpoint', fine_tune_checkpoint)
            .replace('TO_BE_CONFIGURED/train_input_path', json.dumps(train_input_path))
            .replace('TO_BE_CONFIGURED/label_map_path', label_map_path)
            .replace('TO_BE_CONFIGURED/eval_input_path', json.dumps(eval_input_path))
            .replace('TO_BE_CONFIGURED/num_examples', str(eval_frame_count))
            .replace('TO_BE_CONFIGURED/num_training_steps', str(num_training_steps))
            .replace('TO_BE_CONFIGURED/quantization_delay', str(quantization_delay)))
        pipeline_config_path = blob_storage.store_pipeline_config(
            team_uuid, model_uuid, pipeline_config)

        model_dir = blob_storage.get_model_folder_path(team_uuid, model_uuid)
        job_dir = model_dir
        checkpoint_dir = model_dir

        ml = __get_ml_service()
        parent = __get_parent()
        train_job_id = __get_train_job_id(model_uuid)
        scheduling = {
            'maxRunningTime': '%ds' % (max_running_minutes * 60),
        }
        train_training_input = {
            'scaleTier': 'BASIC_TPU',
            'packageUris': [
                object_detection_tar_gz,
                slim_tar_gz,
                pycocotools_tar_gz,
            ],
            'pythonModule': 'object_detection.model_tpu_main',
            'args': [
                '--model_dir', model_dir,
                '--pipeline_config_path', pipeline_config_path,
                '--num_train_steps', str(num_training_steps),
                # Note(lizlooney) I commented out the tpu_zone argument after
                # jobs were failing on July 10, 2020. I found documentation at
                # https://cloud.google.com/ai-platform/training/docs/using-tpus#connecting_to_the_tpu_grpc_server
                # that says "However, you must make one important change when
                # you use TPUClusterResolver for code that runs on AI Platform
                # Training: Do not provide any arguments when you construct
                # the TPUClusterResolver instance. When the tpu, zone, and
                # project keyword arguments are all set to their default value
                # of None, AI Platform Training automatically provides the
                # cluster resolver with the necessary connection details
                # through environment variables."
                #'--tpu_zone', 'us-central1',
            ],
            # TODO(lizlooney): Specify hyperparameters.
            #'hyperparameters': {
            #  object (HyperparameterSpec)
            #},
            'region': 'us-central1',  # Don't hardcode?
            'jobDir': job_dir,
            'runtimeVersion': '1.15',
            'pythonVersion': '3.7',
            'scheduling': scheduling,
        }
        train_job = {
            'jobId': train_job_id,
            'trainingInput': train_training_input,
        }
        train_job_response = ml.projects().jobs().create(
            parent=parent, body=train_job).execute()
    except:
        util.log('model_trainer.start_training_model - creating training job - except %s' %
                 traceback.format_exc().replace('\n', ' ... '))
        # storage.model_trainer_failed_to_start will adjust the team's
        # remaining training time.
        storage.model_trainer_failed_to_start(team_uuid, model_uuid, max_running_minutes)
        raise

    try:
        if eval_frame_count > 0:
            eval_job_id = __get_eval_job_id(model_uuid)
            eval_training_input = {
                'scaleTier': 'BASIC_GPU',
                'packageUris': [
                    object_detection_tar_gz,
                    slim_tar_gz,
                    pycocotools_tar_gz,
                ],
                'pythonModule': 'object_detection.model_main',
                'args': [
                    '--model_dir', model_dir,
                    '--pipeline_config_path', pipeline_config_path,
                    '--checkpoint_dir', checkpoint_dir,
                ],
                'region': 'us-central1',
                'jobDir': job_dir,
                'runtimeVersion': '1.15',
                'pythonVersion': '3.7',
            }
            eval_job = {
                'jobId': eval_job_id,
                'trainingInput': eval_training_input,
            }
            eval_job_response = ml.projects().jobs().create(
                parent=parent, body=eval_job).execute()
        else:
            eval_job_response = None
    except:
        util.log('model_trainer.start_training_model - creating eval job - except %s' %
                 traceback.format_exc().replace('\n', ' ... '))
        # storage.model_trainer_failed_to_start will adjust the team's
        # remaining training time.
        storage.model_trainer_failed_to_start(team_uuid, model_uuid, max_running_minutes)
        # Cancel the training job.
        ml.projects().jobs().cancel(
            name=__get_train_job_name(model_uuid)).execute()
        raise

    model_entity = storage.model_trainer_started(
        team_uuid, model_uuid, description, dataset_uuids, create_time_ms,
        max_running_minutes, num_training_steps, previous_training_steps,
        starting_model, user_visible_starting_model, original_starting_model,
        fine_tune_checkpoint, sorted_label_list, label_map_path,
        train_input_path, eval_input_path, train_frame_count, eval_frame_count,
        train_negative_frame_count, eval_negative_frame_count,
        train_dict_label_to_count, eval_dict_label_to_count,
        train_job_response, eval_job_response)
    return model_entity

def __get_download_url(blob_name):
    blob = util.storage_client().bucket(BUCKET_BLOBS).blob(blob_name)
    if not blob.exists():
        return False, ''
    expires_at_datetime = datetime.now() + timedelta(minutes=10)
    return True, blob.generate_signed_url(expires_at_datetime, method='GET')