def test_randomcutforest(sagemaker_session, cpu_instance_type):
    """End-to-end RandomCutForest integration test: train on random data,
    deploy the model, and check that a single prediction yields one score.
    """
    job_name = unique_name_from_base("randomcutforest")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # Train on a thousand random 14-dimensional datapoints.
        num_features = 14
        training_data = np.random.rand(1000, num_features)

        estimator = RandomCutForest(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            num_trees=50,
            num_samples_per_tree=20,
            eval_metrics=["accuracy", "precision_recall_fscore"],
            sagemaker_session=sagemaker_session,
        )
        estimator.fit(records=estimator.record_set(training_data), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        rcf_model = RandomCutForestModel(
            estimator.model_data,
            role="SageMakerRole",
            sagemaker_session=sagemaker_session,
        )
        predictor = rcf_model.deploy(1, cpu_instance_type, endpoint_name=job_name)

        # One input row should produce exactly one labeled score record.
        scores = predictor.predict(np.random.rand(1, num_features))
        assert len(scores) == 1
        for rec in scores:
            assert rec.label["score"] is not None
            assert len(rec.label["score"].float32_tensor.values) == 1
def test_randomcutforest(sagemaker_session):
    """Train a RandomCutForest on random data, deploy it, and verify a
    single prediction returns exactly one score record.
    """
    with timeout(minutes=15):
        # Generate a thousand 14-dimensional datapoints.
        feature_num = 14
        train_input = np.random.rand(1000, feature_num)

        rcf = RandomCutForest(
            role="SageMakerRole",
            # BUG FIX: train_instance_count/train_instance_type were removed
            # in SageMaker Python SDK v2; the v2 names are instance_count /
            # instance_type (matching the other tests in this file).
            instance_count=1,
            instance_type="ml.c4.xlarge",
            num_trees=50,
            num_samples_per_tree=20,
            sagemaker_session=sagemaker_session,
            base_job_name="test-randomcutforest",
        )
        rcf.fit(rcf.record_set(train_input))

    endpoint_name = name_from_base("randomcutforest")
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
        model = RandomCutForestModel(
            rcf.model_data,
            role="SageMakerRole",
            sagemaker_session=sagemaker_session,
        )
        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name)

        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label["score"] is not None
            assert len(record.label["score"].float32_tensor.values) == 1
def create_model(training_data, s3_bucket):
    """Train a RandomCutForest model and deploy it, updating the existing
    endpoint when one named "randomcutforest*" is already live.

    :param training_data: pandas DataFrame of training rows (converted via
        ``to_numpy()``).
    :param s3_bucket: bucket used for both the input data location and the
        model output path.
    :return: the response from ``update_model`` or ``deploy_model``.
    """
    # BUG FIX: the original created `session = sagemaker.Session()` and never
    # used it — dead local removed.
    endpoints = sagemaker_client.list_endpoints()["Endpoints"]
    # First live endpoint whose name starts with "randomcutforest", or "".
    existing_endpoint_name = next(
        (
            e["EndpointName"]
            for e in endpoints
            if e["EndpointName"].startswith("randomcutforest")
        ),
        "",
    )

    rcf = RandomCutForest(
        role=ROLE_ARN,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        data_location="s3://{}/{}/".format(s3_bucket, INPUT_DATA_PREFIX),
        output_path="s3://{}/{}/output".format(s3_bucket, MODEL_OUTPUT_PREFIX),
        num_samples_per_tree=SAMPLES_PER_TREE,
        num_trees=NUM_OF_TREES,
    )

    record_set = rcf.record_set(training_data.to_numpy(), channel="train", encrypt=False)
    rcf.fit(record_set)

    # Reuse the existing endpoint when present, otherwise create a new one.
    if existing_endpoint_name:
        return update_model(rcf, existing_endpoint_name)
    return deploy_model(rcf)
def test_randomcutforest(sagemaker_session):
    """Integration test: fit a RandomCutForest estimator, deploy the trained
    model, and assert one prediction comes back with a single score value.
    """
    with timeout(minutes=15):
        # Generate a thousand 14-dimensional datapoints.
        feature_num = 14
        train_input = np.random.rand(1000, feature_num)

        rcf = RandomCutForest(
            role="SageMakerRole",
            # BUG FIX: renamed from the SageMaker SDK v1-only parameters
            # train_instance_count / train_instance_type, which were removed
            # in SDK v2 (see the v2 renaming guide).
            instance_count=1,
            instance_type="ml.c4.xlarge",
            num_trees=50,
            num_samples_per_tree=20,
            sagemaker_session=sagemaker_session,
            base_job_name="test-randomcutforest",
        )
        rcf.fit(rcf.record_set(train_input))

    endpoint_name = name_from_base("randomcutforest")
    # Endpoint is torn down by the context manager even if assertions fail.
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
        model = RandomCutForestModel(
            rcf.model_data,
            role="SageMakerRole",
            sagemaker_session=sagemaker_session,
        )
        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name)

        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label["score"] is not None
            assert len(record.label["score"].float32_tensor.values) == 1
def inference(self, **kwargs):
    """Train a RandomCutForest on ``self.df.value``, deploy an inference
    endpoint, and score the same series; results are stored in
    ``self.results``.

    Expected kwargs: bucket, prefix, execution_role, instance_type,
    aws_access_key_id, aws_secret_access_key, region_name.
    """
    self.bucket = kwargs.get('bucket')
    self.prefix = kwargs.get('prefix')
    self.execution_role = kwargs.get('execution_role')
    self.instance_type = kwargs.get('instance_type')
    self.aws_access_key_id = kwargs.get('aws_access_key_id')
    self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
    self.region_name = kwargs.get('region_name')

    print("//////boto3 session generating")
    boto_session = boto3.Session(
        aws_access_key_id=self.aws_access_key_id,
        aws_secret_access_key=self.aws_secret_access_key,
        region_name=self.region_name,
    )

    # check if the bucket exists
    # NOTE(review): failures below only print and fall through to training —
    # presumably intentional best-effort; confirm before tightening.
    print("\n//////check if the bucket exists")
    try:
        boto_session.client('s3').head_bucket(Bucket=self.bucket)
    except botocore.exceptions.ParamValidationError:
        print('Hey! You either forgot to specify your S3 bucket'
              ' or you gave your bucket an invalid name!')
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '403':
            print("Hey! You don't have permission to access the bucket, {}.".format(self.bucket))
        elif e.response['Error']['Code'] == '404':
            # BUG FIX: the original literal contained a raw newline inside a
            # single-quoted string, which is a SyntaxError.
            print("Hey! Your bucket, {}, doesn't exist!".format(self.bucket))
        else:
            raise
    else:
        print('Training input/output will be stored in: s3://{}/{}'.format(self.bucket, self.prefix))

    print("\n//////define sagemaker session")
    sg_session = sagemaker.Session(boto_session)

    print("\n//////define rcf model")
    # NOTE(review): train_instance_* and csv_serializer/json_deserializer are
    # SageMaker SDK v1 APIs; confirm the pinned SDK version before migrating.
    rcf = RandomCutForest(
        role=self.execution_role,
        train_instance_count=1,
        train_instance_type=self.instance_type,
        data_location='s3://{}/{}/'.format(self.bucket, self.prefix),
        output_path='s3://{}/{}/output'.format(self.bucket, self.prefix),
        num_samples_per_tree=512,
        num_trees=50,
        sagemaker_session=sg_session,
    )

    # BUG FIX: DataFrame/Series.as_matrix() was removed in pandas 1.0; use
    # to_numpy(). Convert once and reuse for both training and prediction.
    df_numpy = self.df.value.to_numpy().reshape(-1, 1)

    print("\n//////fitting rcf model")
    # automatically upload the training data to S3 and run the training job
    rcf.fit(rcf.record_set(df_numpy))

    print("\n//////infer the virtual data")
    rcf_inference = rcf.deploy(
        initial_instance_count=1,
        instance_type=self.instance_type,
    )

    print("\n//////serialize the output data")
    rcf_inference.content_type = 'text/csv'
    rcf_inference.serializer = csv_serializer
    rcf_inference.accept = 'application/json'
    rcf_inference.deserializer = json_deserializer

    self.results = rcf_inference.predict(df_numpy)