def main():
    """Make sure a local copy of the MNIST arrays exists, then upload it to S3.

    Command-line arguments:
        --data_dir: directory holding ``train.npz`` / ``test.npz``
            (default ``data/``). When the directory does not exist, the
            dataset is fetched with ``mnist.load_data()`` and written there.

    Environment:
        S3_BUCKET_NAME: target bucket. When unset or empty, the session's
            default bucket is used instead.

    The directory is uploaded under the ``dataset/mnist`` key prefix.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default='data/')
    args = parser.parse_args()

    data_dir = Path(args.data_dir)
    if not data_dir.exists():
        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        # parents=True lets a nested --data_dir (e.g. "cache/mnist") work;
        # a bare mkdir() raises FileNotFoundError when the parent is missing.
        data_dir.mkdir(parents=True, exist_ok=True)
        # np.savez appends the ".npz" suffix itself.
        np.savez(str(data_dir / 'train'), image=x_train, label=y_train)
        np.savez(str(data_dir / 'test'), image=x_test, label=y_test)

    session = Session()
    # Only ask the session for its default bucket when the environment does
    # not name one: passing default_bucket() as the getenv() default would
    # evaluate it unconditionally.
    # NOTE(review): presumably a SageMaker Session, whose default_bucket()
    # calls AWS and may create the bucket -- confirm.
    s3_bucket_name = os.getenv('S3_BUCKET_NAME') or session.default_bucket()
    session.upload_data(path=str(data_dir),
                        bucket=s3_bucket_name,
                        key_prefix='dataset/mnist')
if __name__ == '__main__':
    # Script entry point: read the job configuration from the environment,
    # upload the local training data to S3 and start the run.
    #
    # NOTE: every name assigned here is a module-level global. run() is
    # defined elsewhere and may read them, so none are renamed or removed.
    local_output_path = "file://"
    local_data_path = os.getenv("DATA_PATH")
    project_name = "housing-price-prediction"
    # Job name is the project name plus a UTC timestamp (minute resolution).
    job_name = project_name + time.strftime("-%Y-%m-%d-%H-%M", time.gmtime())

    # Credentials
    role = os.getenv("AWS_SM_ROLE")
    aws_id = os.getenv("AWS_ID")
    region = os.getenv("AWS_REGION")

    # Fail fast with a readable message. Without these values the image URI
    # would be built from the string "None", and upload_data() would be
    # handed None as its local path.
    _missing_env = [
        env_name
        for env_name, env_value in (
            ("DATA_PATH", local_data_path),
            ("AWS_ID", aws_id),
            ("AWS_REGION", region),
        )
        if not env_value
    ]
    if _missing_env:
        raise SystemExit(
            "Missing required environment variable(s): "
            + ", ".join(_missing_env)
        )

    image_uri = "{}.dkr.ecr.{}.amazonaws.com/aws-train".format(aws_id, region)
    print("Training image uri:{}".format(image_uri))

    instance_type = os.getenv("AWS_DEFAULT_INSTANCE")
    bucket_name = os.getenv("AWS_BUCKET")

    # Uncomment the two lines below to create the bucket first; leave them
    # commented out if the bucket already exists or a specific bucket is used.
    # s3 = boto3.client('s3', region_name=region)
    # s3.create_bucket(Bucket=bucket_name,CreateBucketConfiguration={'LocationConstraint': region})

    # Upload the data to S3 under the "data" prefix.
    boto_session = boto3.Session(region_name=region)
    sm_session = Session(boto_session=boto_session)
    data_uri = sm_session.upload_data(local_data_path,
                                      bucket=bucket_name,
                                      key_prefix='data',
                                      extra_args=None)
    print(data_uri)

    run(mode="sagemaker")
class Sagemaker:
    """
    Class to provide AWS specific execution of the models.

    In the future, we can make a superclass that defines the basic methods
    (such as uploading data to the right folder/location, loading models etc).
    For now, we will only have AWS.

    This will be very similar to default session objects.
    """

    # Class-level defaults; override on a subclass or instance to change them.
    training_instance_count = 1
    training_instance_type = "ml.m4.xlarge"
    transformer_instance_count = 1
    transformer_instance_type = "ml.c4.xlarge"
    deploy_instance_count = 1
    deploy_instance_type = "ml.c4.xlarge"
    # Region used when the constructor is not given one.
    default_region = "eu-west-1"

    def __init__(
        self,
        bucket: Optional[str] = None,
        role: Optional[str] = None,
        prefix: Optional[str] = None,
        default_model_kwargs: Optional[Dict] = None,
        default_transfomer_kwargs: Optional[Dict] = None,
        default_deploy_kwargs: Optional[Dict] = None,
        region: Optional[str] = None,
    ) -> None:
        """
        Initializes the AWS object

        Arguments:
            bucket: The bucket name. Defaulted to the session default bucket
            role: The role name to assume. Default is getting from
                AWS_DEFAULT_ROLE of the env variables
            prefix: The prefix to use in the bucket. Defaulted to 'data'
            default_model_kwargs: Dict for default kwargs for any sagemaker
                model. Default contains train_instance_type,
                train_instance_count, role and session
            default_transfomer_kwargs: Dict for default kwargs for any
                sagemaker transformer. Default contains instance_type,
                instance_count, and role. (The parameter name is spelled
                without the second "r"; it is kept for backward
                compatibility.)
            default_deploy_kwargs: Dict for default kwargs for any sagemaker
                deployment. Default contains instance_type and
                initial_instance_count.
            region: The AWS region for the boto session. Defaulted to
                ``default_region`` ("eu-west-1").
        """
        LOGGER.info("Initializing Sagemaker executor")
        self.boto_session = BotoSession(
            aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
            region_name=region if region is not None else self.default_region,
        )
        self.region = self.boto_session.region_name
        self.session = Session(boto_session=self.boto_session)

        # Explicit arguments win; otherwise fall back to the environment /
        # session defaults.
        self.role = (
            role if role is not None else os.environ.get("AWS_DEFAULT_ROLE")
        )
        self.bucket = (
            bucket if bucket is not None else self.session.default_bucket()
        )
        self.prefix = prefix if prefix is not None else "data"

        self.default_model_kwargs = self._default_model_kwargs(
            self.role, self.session, default_model_kwargs)
        self.default_transformer_kwargs = self._default_transformer_kwargs(
            self.role, self.session, default_transfomer_kwargs)
        self.default_deploy_kwargs = self._default_deploy_kwargs(
            self.role, self.session, default_deploy_kwargs)

    @staticmethod
    def _merge_kwargs(defaults: Dict, overrides: Optional[Dict]) -> Dict:
        """Return ``defaults`` updated in place with ``overrides``, if given."""
        if overrides is not None:
            defaults.update(overrides)
        return defaults

    def _default_model_kwargs(self, role, session, input_default) -> Dict:
        """Build the model kwargs; entries in ``input_default`` take priority."""
        return self._merge_kwargs(
            {
                "role": role,
                "sagemaker_session": session,
                "train_instance_count": self.training_instance_count,
                "train_instance_type": self.training_instance_type,
            },
            input_default,
        )

    def _default_transformer_kwargs(self, role, session,
                                    input_default) -> Dict:
        """Build the transformer kwargs; ``input_default`` takes priority.

        ``session`` is accepted for signature parity with the other builders
        but is not part of the resulting dict.
        """
        return self._merge_kwargs(
            {
                "role": role,
                "instance_count": self.transformer_instance_count,
                "instance_type": self.transformer_instance_type,
            },
            input_default,
        )

    def _default_deploy_kwargs(self, role, session, input_default) -> Dict:
        """Build the deploy kwargs; ``input_default`` takes priority.

        ``role`` and ``session`` are accepted for signature parity but are
        not part of the resulting dict.
        """
        return self._merge_kwargs(
            {
                "initial_instance_count": self.deploy_instance_count,
                "instance_type": self.deploy_instance_type,
            },
            input_default,
        )

    def upload_data(
        self,
        local_data_file: str,
        bucket: Optional[str] = None,
        prefix: Optional[str] = None,
    ) -> str:
        """
        Uploads the data from the local data file to S3. Returns the location

        Arguments:
            local_data_file: the location of the data
            bucket: The bucket to upload to. Defaulted to the own default
                bucket
            prefix: The prefix to upload under. Defaulted to the own default
                prefix

        Returns:
            The s3 data location
        """
        if bucket is None:
            bucket = self.bucket
        if prefix is None:
            prefix = self.prefix
        LOGGER.info("Uploading data to S3")
        return self.session.upload_data(local_data_file,
                                        bucket=bucket,
                                        key_prefix=prefix)

    def download_data(
        self,
        file_name: str,
        local_file_directory: str,
        bucket: Optional[str] = None,
        prefix: Optional[str] = None,
    ) -> str:
        """
        Downloads the S3 data and stores it to the local file location.

        Arguments:
            file_name: the name of the file
            local_file_directory: the directory to store the data to
            bucket: The bucket to download from. Defaulted to the own default
                bucket
            prefix: The prefix to download from. Defaulted to the own default
                prefix

        Returns:
            The local file location.
        """
        # Honour the caller's bucket; previously this argument was accepted
        # but self.bucket was always used.
        if bucket is None:
            bucket = self.bucket
        if prefix is None:
            prefix = self.prefix

        # An empty prefix must not produce a key with a leading "/".
        # NOTE(review): this is meant to mirror how the SageMaker SDK's
        # upload_data builds keys for an empty key_prefix -- confirm against
        # the installed SDK version.
        key = f"{prefix}/{file_name}" if prefix else file_name
        local_file_name = os.path.join(local_file_directory, file_name)
        LOGGER.info(
            f"Downloading data from s3: from s3://{bucket}/{key} to {local_file_name}"
        )

        # Create the directory that will actually contain the file. This also
        # covers a file_name with sub-directories, and exist_ok avoids the
        # check-then-create race.
        target_directory = os.path.dirname(local_file_name)
        if target_directory:
            os.makedirs(target_directory, exist_ok=True)

        s3_client = self.boto_session.client("s3")
        s3_client.download_file(Bucket=bucket,
                                Key=key,
                                Filename=local_file_name)
        return local_file_name