def test_randomcutforest(sagemaker_session, cpu_instance_type):
    """Train a RandomCutForest on random data, deploy it, and score one record."""
    job_name = unique_name_from_base("randomcutforest")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # Synthetic training set: 1000 points with 14 features each.
        n_features = 14
        training_data = np.random.rand(1000, n_features)

        estimator = RandomCutForest(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            num_trees=50,
            num_samples_per_tree=20,
            eval_metrics=["accuracy", "precision_recall_fscore"],
            sagemaker_session=sagemaker_session,
        )
        estimator.fit(records=estimator.record_set(training_data), job_name=job_name)

    # Endpoint is named after the training job and deleted when the context exits.
    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = RandomCutForestModel(
            estimator.model_data,
            role="SageMakerRole",
            sagemaker_session=sagemaker_session,
        )
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)

        sample = np.random.rand(1, n_features)
        result = predictor.predict(sample)

        # One record in, one scored record out.
        assert len(result) == 1
        for record in result:
            assert record.label["score"] is not None
            assert len(record.label["score"].float32_tensor.values) == 1
# ---- Example #2 ----
def test_randomcutforest(sagemaker_session):
    """Train a RandomCutForest on random data, deploy it, and score one record.

    Fix: ``train_instance_count``/``train_instance_type`` were removed in
    SageMaker Python SDK v2 — renamed here to ``instance_count``/
    ``instance_type``, matching the other RandomCutForest constructors in
    this file.
    """
    with timeout(minutes=15):
        # Generate a thousand 14-dimensional datapoints.
        feature_num = 14
        train_input = np.random.rand(1000, feature_num)

        rcf = RandomCutForest(role='SageMakerRole',
                              instance_count=1,
                              instance_type='ml.c4.xlarge',
                              num_trees=50,
                              num_samples_per_tree=20,
                              sagemaker_session=sagemaker_session,
                              base_job_name='test-randomcutforest')

        rcf.fit(rcf.record_set(train_input))

    endpoint_name = name_from_base('randomcutforest')
    # Endpoint is torn down by the context manager even if assertions fail.
    with timeout_and_delete_endpoint_by_name(endpoint_name,
                                             sagemaker_session,
                                             minutes=20):
        model = RandomCutForestModel(rcf.model_data,
                                     role='SageMakerRole',
                                     sagemaker_session=sagemaker_session)
        predictor = model.deploy(1,
                                 'ml.c4.xlarge',
                                 endpoint_name=endpoint_name)

        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        # One record in, one scored record out.
        assert len(result) == 1
        for record in result:
            assert record.label["score"] is not None
            assert len(record.label["score"].float32_tensor.values) == 1
# ---- Example #3 ----
def create_model(training_data, s3_bucket):
    """Train a RandomCutForest on *training_data* and deploy or update an endpoint.

    If an endpoint whose name starts with ``randomcutforest`` already exists,
    the trained model is rolled onto it via ``update_model``; otherwise a new
    endpoint is created via ``deploy_model``.

    :param training_data: pandas DataFrame of training records.
    :param s3_bucket: S3 bucket used for training input and model output.
    :return: the response of ``update_model`` or ``deploy_model``.
    """
    # Fix: dropped the unused ``session = sagemaker.Session()`` local from the
    # original — it was never passed anywhere.
    # Reuse an existing "randomcutforest*" endpoint if one is already running.
    endpoints = sagemaker_client.list_endpoints()["Endpoints"]
    existing_endpoint_name = next(
        (
            e["EndpointName"]
            for e in endpoints
            if e["EndpointName"].startswith("randomcutforest")
        ),
        "",
    )
    endpoint_exists = bool(existing_endpoint_name)

    rcf = RandomCutForest(
        role=ROLE_ARN,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        data_location="s3://{}/{}/".format(s3_bucket, INPUT_DATA_PREFIX),
        output_path="s3://{}/{}/output".format(s3_bucket, MODEL_OUTPUT_PREFIX),
        num_samples_per_tree=SAMPLES_PER_TREE,
        num_trees=NUM_OF_TREES,
    )
    numpy_data = training_data.to_numpy()
    record_set = rcf.record_set(numpy_data, channel="train", encrypt=False)
    rcf.fit(record_set)

    if endpoint_exists:
        response = update_model(rcf, existing_endpoint_name)
    else:
        response = deploy_model(rcf)
    return response
def test_randomcutforest(sagemaker_session):
    """Train a RandomCutForest on random data, deploy it, and score one record.

    Fix: ``train_instance_count``/``train_instance_type`` were removed in
    SageMaker Python SDK v2 — renamed here to ``instance_count``/
    ``instance_type``, matching the other RandomCutForest constructors in
    this file.
    """
    with timeout(minutes=15):
        # Generate a thousand 14-dimensional datapoints.
        feature_num = 14
        train_input = np.random.rand(1000, feature_num)

        rcf = RandomCutForest(role='SageMakerRole', instance_count=1, instance_type='ml.c4.xlarge',
                              num_trees=50, num_samples_per_tree=20, sagemaker_session=sagemaker_session,
                              base_job_name='test-randomcutforest')

        rcf.fit(rcf.record_set(train_input))

    endpoint_name = name_from_base('randomcutforest')
    # Endpoint is torn down by the context manager even if assertions fail.
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
        model = RandomCutForestModel(rcf.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        # One record in, one scored record out.
        assert len(result) == 1
        for record in result:
            assert record.label["score"] is not None
            assert len(record.label["score"].float32_tensor.values) == 1
# ---- Example #5 ----
    def inference(self, **kwargs):
        """Train an RCF model on ``self.df.value`` and run inference over it.

        Fix: ``DataFrame.as_matrix()`` was removed in pandas 1.0 — replaced
        with the backward-compatible ``.values`` accessor; the duplicated
        array construction is also hoisted so the data is converted once.

        Keyword args: bucket, prefix, execution_role, instance_type,
        aws_access_key_id, aws_secret_access_key, region_name.
        Side effects: launches a SageMaker training job and deploys an
        inference endpoint; stores predictions in ``self.results``.
        """
        self.bucket = kwargs.get('bucket')
        self.prefix = kwargs.get('prefix')
        self.execution_role = kwargs.get('execution_role')
        self.instance_type = kwargs.get('instance_type')
        self.aws_access_key_id = kwargs.get('aws_access_key_id')
        self.aws_secret_access_key = kwargs.get('aws_secret_access_key')
        self.region_name = kwargs.get('region_name')

        print("//////boto3 session generating")
        boto_session = boto3.Session(
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            region_name=self.region_name
        )

        # check if the bucket exists
        print("\n//////check if the bucket exists")
        try:
            boto_session.client('s3').head_bucket(Bucket=self.bucket)
        except botocore.exceptions.ParamValidationError:
            print('Hey! You either forgot to specify your S3 bucket'
                  ' or you gave your bucket an invalid name!')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == '403':
                print("Hey! You don't have permission to access the bucket, {}.".format(self.bucket))
            elif e.response['Error']['Code'] == '404':
                print("Hey! Your bucket, {}, doesn't exist!".format(self.bucket))
            else:
                raise
        else:
            print('Training input/output will be stored in: s3://{}/{}'.format(self.bucket, self.prefix))

        print("\n//////define sagemaker session")
        sg_session = sagemaker.Session(boto_session)

        print("\n//////define rcf model")
        # specify general training job information
        rcf = RandomCutForest(role=self.execution_role,
                              train_instance_count=1,
                              train_instance_type=self.instance_type,
                              data_location='s3://{}/{}/'.format(self.bucket, self.prefix),
                              output_path='s3://{}/{}/output'.format(self.bucket, self.prefix),
                              num_samples_per_tree=512,
                              num_trees=50,
                              sagemaker_session=sg_session)

        # Convert the series once; used for both training and inference below.
        df_numpy = self.df.value.values.reshape(-1, 1)

        print("\n//////fitting rcf model")
        # automatically upload the training data to S3 and run the training job
        rcf.fit(rcf.record_set(df_numpy))

        print("\n//////infer the virtual data")
        rcf_inference = rcf.deploy(
            initial_instance_count=1,
            instance_type=self.instance_type,
        )

        print("\n//////serialize the output data")
        # CSV in / JSON out on the inference endpoint.
        rcf_inference.content_type = 'text/csv'
        rcf_inference.serializer = csv_serializer
        rcf_inference.accept = 'application/json'
        rcf_inference.deserializer = json_deserializer

        self.results = rcf_inference.predict(df_numpy)