def load_from_store(self, expected_checkpoint_number=-1):
    """Block until a usable checkpoint for every agent has been downloaded from S3.

    For each agent's bucket this polls S3 until the trainer's lock file is gone
    (unless ``self.ignore_lock``), downloads the coach checkpoint state file,
    mirrors the optional Finished / Trainer-Ready marker files locally, and then
    downloads the checkpoint data files (skipping frozen ``.pb`` models).

    Args:
        expected_checkpoint_number (int): minimum checkpoint number to accept;
            an older checkpoint triggers another wait-and-poll cycle. The
            default -1 accepts any checkpoint.

    Returns:
        bool: True once every agent's checkpoint is in place. On S3 errors the
        process terminates via ``log_and_exit`` instead of raising.
    """
    try:
        base_checkpoint_dir = self.params.base_checkpoint_dir
        for agent_key, bucket in self.params.buckets.items():
            # Single-agent runs keep the flat layout; multi-agent runs get one
            # sub-directory per agent.
            checkpoint_dir = base_checkpoint_dir if len(self.graph_manager.agents_params) == 1 \
                else os.path.join(base_checkpoint_dir, agent_key)
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            while True:
                # Re-create the client on every poll so a long wait does not
                # hold on to a stale connection.
                s3_client = self._get_client()
                state_file = CheckpointStateFile(os.path.abspath(checkpoint_dir))
                # wait until lock is removed
                response = s3_client.list_objects_v2(
                    Bucket=bucket,
                    Prefix=self._get_s3_key(SyncFiles.LOCKFILE.value, agent_key))
                if "Contents" not in response or self.ignore_lock:
                    try:
                        checkpoint_file_path = os.path.abspath(
                            os.path.join(checkpoint_dir, state_file.path))
                        # fetch checkpoint state file from S3
                        s3_client.download_file(
                            Bucket=bucket,
                            Key=self._get_s3_key(state_file.filename, agent_key),
                            Filename=checkpoint_file_path)
                    except botocore.exceptions.ClientError:
                        # When the lock is ignored a missing state file is fatal
                        # (user-class error); otherwise keep polling.
                        if self.ignore_lock:
                            log_and_exit("Checkpoint not found",
                                         SIMAPP_S3_DATA_STORE_EXCEPTION,
                                         SIMAPP_EVENT_ERROR_CODE_400)
                        time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND)
                        continue
                    except Exception:
                        if self.ignore_lock:
                            log_and_exit("Checkpoint not found",
                                         SIMAPP_S3_DATA_STORE_EXCEPTION,
                                         SIMAPP_EVENT_ERROR_CODE_500)
                        time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND)
                        continue
                else:
                    # Lock still present: the trainer is writing; poll again.
                    time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND)
                    continue
                # check if there's a Finished file
                response = s3_client.list_objects_v2(
                    Bucket=bucket,
                    Prefix=self._get_s3_key(SyncFiles.FINISHED.value, agent_key))
                if "Contents" in response:
                    try:
                        finished_file_path = os.path.abspath(
                            os.path.join(checkpoint_dir, SyncFiles.FINISHED.value))
                        s3_client.download_file(
                            Bucket=bucket,
                            Key=self._get_s3_key(SyncFiles.FINISHED.value, agent_key),
                            Filename=finished_file_path)
                    except Exception:
                        # Marker downloads are best-effort; absence is not an error.
                        pass
                # check if there's a Ready file
                response = s3_client.list_objects_v2(
                    Bucket=bucket,
                    Prefix=self._get_s3_key(SyncFiles.TRAINER_READY.value, agent_key))
                if "Contents" in response:
                    try:
                        ready_file_path = os.path.abspath(
                            os.path.join(checkpoint_dir, SyncFiles.TRAINER_READY.value))
                        s3_client.download_file(
                            Bucket=bucket,
                            Key=self._get_s3_key(SyncFiles.TRAINER_READY.value, agent_key),
                            Filename=ready_file_path)
                    except Exception:
                        # Best-effort as above.
                        pass
                checkpoint_state = state_file.read()
                if checkpoint_state is not None:
                    # if we get a checkpoint that is older that the expected checkpoint, we wait for
                    # the new checkpoint to arrive.
                    if checkpoint_state.num < expected_checkpoint_number:
                        time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND)
                        continue
                    response = s3_client.list_objects_v2(
                        Bucket=bucket,
                        Prefix=self._get_s3_key("", agent_key))
                    if "Contents" in response:
                        # Check to see if the desired checkpoint is in the bucket.
                        # any() consumes the generator directly — no intermediate list.
                        has_chkpnt = any(os.path.split(obj['Key'])[1].startswith(checkpoint_state.name)
                                         for obj in response['Contents'])
                        # Loop-invariant key prefix, hoisted out of the object loop.
                        full_key_prefix = os.path.normpath(self.key_prefixes[agent_key]) + "/"
                        for obj in response["Contents"]:
                            filename = os.path.abspath(
                                os.path.join(checkpoint_dir,
                                             obj["Key"].replace(full_key_prefix, "")))
                            dirname, basename = os.path.split(filename)
                            # Download all the checkpoints but not the frozen models since they
                            # are not necessary
                            _, file_extension = os.path.splitext(obj["Key"])
                            if file_extension != '.pb' \
                                    and (basename.startswith(checkpoint_state.name) or not has_chkpnt):
                                if not os.path.exists(dirname):
                                    os.makedirs(dirname)
                                s3_client.download_file(Bucket=bucket,
                                                        Key=obj["Key"],
                                                        Filename=filename)
                        # Change the coach checkpoint file to point to the latest available checkpoint,
                        # also log that we are changing the checkpoint.
                        if not has_chkpnt:
                            all_ckpnts = _filter_checkpoint_files(os.listdir(checkpoint_dir))
                            if all_ckpnts:
                                # Single-line message; lazy %-style args per logging convention.
                                LOG.info("%s not in s3 bucket, downloading all checkpoints and using %s",
                                         checkpoint_state.name, all_ckpnts[-1])
                                state_file.write(all_ckpnts[-1])
                            else:
                                log_and_exit("No checkpoint files",
                                             SIMAPP_S3_DATA_STORE_EXCEPTION,
                                             SIMAPP_EVENT_ERROR_CODE_400)
                # Checkpoint (or lack of a readable state file) handled — next agent.
                break
        return True
    except botocore.exceptions.ClientError:
        log_and_exit("Unable to download checkpoint",
                     SIMAPP_S3_DATA_STORE_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)
    except Exception:
        log_and_exit("Unable to download checkpoint",
                     SIMAPP_S3_DATA_STORE_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
def load_from_store(self, expected_checkpoint_number=-1):
    """Block until a usable checkpoint has been downloaded from S3 into the
    local checkpoint directory.

    Polls the bucket until the trainer's lock file disappears, downloads the
    coach checkpoint state file, mirrors the optional Finished / Trainer-Ready
    marker files, and then downloads the checkpoint data files (skipping
    frozen ``.pb`` models).

    Args:
        expected_checkpoint_number (int): minimum checkpoint number to accept;
            an older checkpoint triggers another wait-and-poll cycle. The
            default -1 accepts any checkpoint.

    Returns:
        bool: True when the checkpoint is in place. On S3 errors the simapp
        exits gracefully instead of raising.
    """
    try:
        if not os.path.exists(self.params.checkpoint_dir):
            os.makedirs(self.params.checkpoint_dir)
        while True:
            # Re-create the client on every poll so a long wait does not hold
            # on to a stale connection.
            s3_client = self._get_client()
            state_file = CheckpointStateFile(os.path.abspath(self.params.checkpoint_dir))
            # wait until lock is removed
            response = s3_client.list_objects_v2(
                Bucket=self.params.bucket,
                Prefix=self._get_s3_key(SyncFiles.LOCKFILE.value))
            if "Contents" not in response:
                try:
                    # fetch checkpoint state file from S3
                    s3_client.download_file(Bucket=self.params.bucket,
                                            Key=self._get_s3_key(state_file.filename),
                                            Filename=state_file.path)
                except Exception:
                    # State file not uploaded yet; poll again.
                    time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND)
                    continue
            else:
                # Lock still present: the trainer is writing; poll again.
                time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND)
                continue
            # check if there's a Finished file
            response = s3_client.list_objects_v2(
                Bucket=self.params.bucket,
                Prefix=self._get_s3_key(SyncFiles.FINISHED.value))
            if "Contents" in response:
                try:
                    finished_file_path = os.path.abspath(
                        os.path.join(self.params.checkpoint_dir, SyncFiles.FINISHED.value))
                    s3_client.download_file(Bucket=self.params.bucket,
                                            Key=self._get_s3_key(SyncFiles.FINISHED.value),
                                            Filename=finished_file_path)
                except Exception:
                    # Marker downloads are best-effort; absence is not an error.
                    pass
            # check if there's a Ready file
            response = s3_client.list_objects_v2(
                Bucket=self.params.bucket,
                Prefix=self._get_s3_key(SyncFiles.TRAINER_READY.value))
            if "Contents" in response:
                try:
                    ready_file_path = os.path.abspath(
                        os.path.join(self.params.checkpoint_dir, SyncFiles.TRAINER_READY.value))
                    s3_client.download_file(Bucket=self.params.bucket,
                                            Key=self._get_s3_key(SyncFiles.TRAINER_READY.value),
                                            Filename=ready_file_path)
                except Exception:
                    # Best-effort as above.
                    pass
            checkpoint_state = state_file.read()
            if checkpoint_state is not None:
                # if we get a checkpoint that is older that the expected checkpoint, we wait for
                # the new checkpoint to arrive.
                if checkpoint_state.num < expected_checkpoint_number:
                    time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND)
                    continue
                response = s3_client.list_objects_v2(Bucket=self.params.bucket,
                                                     Prefix=self._get_s3_key(""))
                if "Contents" in response:
                    # Check to see if the desired checkpoint is in the bucket.
                    # any() consumes the generator directly — no intermediate list.
                    has_chkpnt = any(os.path.split(obj['Key'])[1].startswith(checkpoint_state.name)
                                     for obj in response['Contents'])
                    # Loop-invariant key prefix, hoisted out of the object loop.
                    full_key_prefix = os.path.normpath(self.key_prefix) + "/"
                    for obj in response["Contents"]:
                        filename = os.path.abspath(
                            os.path.join(self.params.checkpoint_dir,
                                         obj["Key"].replace(full_key_prefix, "")))
                        dirname, basename = os.path.split(filename)
                        # Download all the checkpoints but not the frozen models since they
                        # are not necessary
                        _, file_extension = os.path.splitext(obj["Key"])
                        if file_extension != '.pb' \
                                and (basename.startswith(checkpoint_state.name) or not has_chkpnt):
                            if not os.path.exists(dirname):
                                os.makedirs(dirname)
                            s3_client.download_file(Bucket=self.params.bucket,
                                                    Key=obj["Key"],
                                                    Filename=filename)
                    # Change the coach checkpoint file to point to the latest available checkpoint,
                    # also log that we are changing the checkpoint.
                    if not has_chkpnt:
                        all_ckpnts = _filter_checkpoint_files(os.listdir(self.params.checkpoint_dir))
                        if all_ckpnts:
                            # Single-line message; lazy %-style args per logging convention.
                            logger.info("%s not in s3 bucket, downloading all checkpoints and using %s",
                                        checkpoint_state.name, all_ckpnts[-1])
                            state_file.write(all_ckpnts[-1])
                        else:
                            utils.json_format_logger(
                                "No checkpoint files found in {}".format(self.params.bucket),
                                **utils.build_user_error_dict(utils.SIMAPP_S3_DATA_STORE_EXCEPTION,
                                                              utils.SIMAPP_EVENT_ERROR_CODE_400))
                            utils.simapp_exit_gracefully()
            return True
    except botocore.exceptions.ClientError as e:
        # User-class error: surface the S3 error code.
        utils.json_format_logger(
            "Unable to download checkpoint from {}, {}"
            .format(self.params.bucket, e.response['Error']['Code']),
            **utils.build_user_error_dict(utils.SIMAPP_S3_DATA_STORE_EXCEPTION,
                                          utils.SIMAPP_EVENT_ERROR_CODE_400))
        utils.simapp_exit_gracefully()
    except Exception as e:
        # System-class error: anything unexpected.
        utils.json_format_logger(
            "Unable to download checkpoint from {}, {}"
            .format(self.params.bucket, e),
            **utils.build_system_error_dict(utils.SIMAPP_S3_DATA_STORE_EXCEPTION,
                                            utils.SIMAPP_EVENT_ERROR_CODE_500))
        utils.simapp_exit_gracefully()