def trainModel():

	sess = sagemaker.Session()
	container = get_image_uri(region, 'xgboost')
	
	YColumns = ['result']
	numericalCols = ['guarantee_percentage', 'container_id_label']
	categoricalCols = [ 'component_name', 'slot_names', 'container_type', 'component_namespace',
						'component_display_name', 'customer_targeting', 'site']

	columns_to_keep = YColumns + numericalCols + categoricalCols

	output_path_str = 's3://{}/{}/sagemaker-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
	xgb = sagemaker.estimator.Estimator(container, role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    output_path=output_path_str.format(input_bucket, 'results'),
                                    sagemaker_session=sess)

	xgb.set_hyperparameters(objective='multi:softmax',
	                        colsample_bytree=0.3,
	                        learning_rate=0.3,
	                        max_depth=16,
	                        alpha=5,
	                        num_class=6,
	                        n_estimators=200,
	                        num_round=200)

	input_prefix = 'inputs'
	s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/{}'.format(input_bucket, input_prefix, s3_training_file), content_type='csv')
	s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/{}'.format(input_bucket, input_prefix, s3_training_file), content_type='csv')
	
	xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})
	saveModel(xgb, columns_to_keep)
	return
Example #2
    def fetch_data(self, s3_path):

        logger.info('Creating pointers to the files in S3...')

        train_path = s3_path + 'train/' + self.model_name + '_train.csv'

        s3_input_train = sagemaker.s3_input(s3_data=train_path,
                                            content_type='csv')

        val_path = s3_path + 'val/' + self.model_name + '_val.csv'

        s3_input_val = sagemaker.s3_input(s3_data=val_path, content_type='csv')

        return s3_input_train, s3_input_val
def get_training_params(
    model_name,
    job_id,
    role,
    image_uri,
    training_uri,
    validation_uri,
    output_uri,
    hyperparameters,
    kms_key_id,
):
    # Create the estimator
    xgb = sagemaker.estimator.Estimator(
        image_uri,
        role,
        train_instance_count=1,
        train_instance_type="ml.m4.xlarge",
        output_path=output_uri,
    )
    # Set the default hyperparameters, overriding them with any provided values
    params = {
        "max_depth": "9",
        "eta": "0.2",
        "gamma": "4",
        "min_child_weight": "300",
        "subsample": "0.8",
        "objective": "reg:linear",
        "early_stopping_rounds": "10",
        "num_round": "100",
    }
    xgb.set_hyperparameters(**{**params, **hyperparameters})

    # Specify the data source
    s3_input_train = sagemaker.s3_input(s3_data=training_uri,
                                        content_type="csv")
    s3_input_val = sagemaker.s3_input(s3_data=validation_uri,
                                      content_type="csv")
    data = {"train": s3_input_train, "validation": s3_input_val}

    # Get the training request
    request = training_config(xgb, inputs=data, job_name=job_id)
    return {
        "Parameters": {
            "ModelName": model_name,
            "TrainJobId": job_id,
            "TrainJobRequest": json.dumps(request),
            "KmsKeyId": kms_key_id,
        }
    }
Example #4
def test_s3_input_all_arguments():
    prefix = "pre"
    distribution = "FullyReplicated"
    compression = "Gzip"
    content_type = "text/csv"
    record_wrapping = "RecordIO"
    s3_data_type = "Manifestfile"
    input_mode = "Pipe"
    result = s3_input(
        s3_data=prefix,
        distribution=distribution,
        compression=compression,
        input_mode=input_mode,
        content_type=content_type,
        record_wrapping=record_wrapping,
        s3_data_type=s3_data_type,
    )
    expected = {
        "DataSource": {
            "S3DataSource": {
                "S3DataDistributionType": distribution,
                "S3DataType": s3_data_type,
                "S3Uri": prefix,
            }
        },
        "CompressionType": compression,
        "ContentType": content_type,
        "RecordWrapperType": record_wrapping,
        "InputMode": input_mode,
    }

    assert result.config == expected
Example #5
def test_s3_input_all_arguments():
    prefix = 'pre'
    distribution = 'FullyReplicated'
    compression = 'Gzip'
    content_type = 'text/csv'
    record_wrapping = 'RecordIO'
    s3_data_type = 'Manifestfile'
    result = s3_input(s3_data=prefix,
                      distribution=distribution,
                      compression=compression,
                      content_type=content_type,
                      record_wrapping=record_wrapping,
                      s3_data_type=s3_data_type)
    expected = \
        {'DataSource': {
            'S3DataSource': {
                'S3DataDistributionType': distribution,
                'S3DataType': s3_data_type,
                'S3Uri': prefix,
            }
        },
            'CompressionType': compression,
            'ContentType': content_type,
            'RecordWrapperType': record_wrapping
        }

    assert result.config == expected
def run_test(sagemaker_session,
             ecr_image,
             instance_type,
             framework_version,
             test_data,
             record_wrapper_type=None):
    source_path = os.path.join(os.path.dirname(__file__), '..', '..',
                               'resources', 'pipemode')
    script = os.path.join(source_path, 'pipemode.py')
    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           train_instance_type=instance_type,
                           train_instance_count=1,
                           sagemaker_session=sagemaker_session,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           script_mode=True,
                           input_mode='Pipe',
                           hyperparameters={'dimension': DIMENSION})
    input = s3_input(s3_data=test_data,
                     distribution='FullyReplicated',
                     record_wrapping=record_wrapper_type,
                     input_mode='Pipe')
    with timeout(minutes=20):
        estimator.fit(
            {'elizabeth': input},
            job_name=unique_name_from_base('test-sagemaker-pipemode'))
Example #7
    def _init_s3_train_files(self):
        """ Initialize the training and validation files (features + label) required for the training step """
        # XGBoost requires libsvm training and validation files when invoking fit()
        self._prepare_libsvm_data()

        if self.n_classes <= 2:
            self.hyperparameters['eval_metric'] = 'auc'
            self.hyperparameters['objective'] = 'binary:logistic'
        else:
            self.hyperparameters['objective'] = 'multi:softprob'
            self.hyperparameters['num_class'] = self.n_classes

        s3_input_training = sagemaker.s3_input(
            s3_data=self.s3_training_libsvm_path, content_type='libsvm')
        s3_input_validation = sagemaker.s3_input(
            s3_data=self.s3_validation_libsvm_path, content_type='libsvm')
        return s3_input_training, s3_input_validation
def sagemakerTrain(event):

    global job_id
    global working_bucket
    global origin
    global hyperparams
    global model_path_prefix
    global train_job_name

    try:
        s3_train_data = sagemaker.s3_input(
            's3://{}/train'.format(working_bucket), content_type='csv')
        s3_valid_data = sagemaker.s3_input(
            's3://{}/validation'.format(working_bucket), content_type='csv')

        logger.info("Initiating Sagemaker training with data from {}".format(
            s3_train_data))

        session = sagemaker.Session()

        # Set up the training for this Algo
        if origin == 'xgboost':
            xgboost = smbuiltin.XGBoost(event)
            container = xgboost.getcontainer(boto3.Session().region_name)
            trainer = xgboost.buildtrainer(container, working_bucket, session)
            # set up the hyperparameters
            trainer = xgboost.sethyperparameters(trainer, hyperparams)

            trainer.fit({
                'train': s3_train_data,
                'validation': s3_valid_data
            },
                        wait=False)
            train_job_name = trainer.latest_training_job.name
            logger.info("training job unique id = {}".format(train_job_name))
            model_path_prefix = "/" + train_job_name + "/output"
        else:
            logger.error(
                "sagemakerTrain::Serverlesss Sagemaker process does not support the {} algorithm."
                .format(origin))

    except Exception as err:
        logger.error(
            "sagemakerTrain::Error while launching SageMaker training: {}".
            format(err))
Example #9
    def _init_s3_train_files(self):
        """ Initialize the training and validation files (features + label) required for the training step """
        # LinearModel requires CSV training and validation files, including labels, when invoking fit()
        logging.info('Preparing csv training data...')
        self._prepare_csv_full_file(self.training,
                                    self.csv_training_full_filename)
        self._prepare_csv_full_file(self.validation,
                                    self.csv_validation_full_filename)

        self.hyperparameters['feature_dim'] = self.training_x.shape[1]
        self.hyperparameters['num_classes'] = self.n_classes
        # binary_classifier, multiclass_classifier, or regressor
        if 'predictor_type' not in self.hyperparameters:
            self.hyperparameters[
                'predictor_type'] = 'binary_classifier' if self.n_classes == 2 else 'multiclass_classifier'

        s3_input_training = sagemaker.s3_input(
            s3_data=self.s3_training_full_csv_path, content_type='text/csv')
        s3_input_validation = sagemaker.s3_input(
            s3_data=self.s3_validation_full_csv_path, content_type='text/csv')
        return s3_input_training, s3_input_validation
def create_model(image: str, hyperparameters: dict, instance_type: str,
                 output_path: str, region_name: str, role: str, s3_train: str,
                 s3_validation: str, job_name: str):
    if image == 'xgboost':
        container = get_image_uri(region_name, image, '0.90-2')
    else:
        container = get_image_uri(region_name, image)
    save_interval = '1'
    model = sagemaker.estimator.Estimator(
        container,
        role=role,
        train_instance_count=1,
        train_instance_type=instance_type,
        train_use_spot_instances=True,
        train_max_run=300,
        train_max_wait=600,
        output_path=output_path,
        debugger_hook_config=DebuggerHookConfig(
            s3_output_path=f's3://{bucket}/{prefix}/debug',
            collection_configs=[
                CollectionConfig(name='metrics',
                                 parameters={'save_interval': save_interval}),
                CollectionConfig(name='feature_importance',
                                 parameters={'save_interval': save_interval}),
                CollectionConfig(name='full_shap',
                                 parameters={'save_interval': save_interval}),
                CollectionConfig(name='average_shap',
                                 parameters={'save_interval': save_interval})
            ]),
        rules=[
            Rule.sagemaker(rule_configs.class_imbalance(),
                           rule_parameters={'collection_names': 'metrics'})
        ])
    model.set_hyperparameters(**hyperparameters)
    data_channel = {
        'train': s3_input(s3_train, content_type='text/csv'),
        'validation': s3_input(s3_validation, content_type='text/csv')
    }
    model.fit(data_channel, job_name=job_name)
    return model
Example #11
def test_s3_input_all_defaults():
    prefix = "pre"
    actual = s3_input(s3_data=prefix)
    expected = {
        "DataSource": {
            "S3DataSource": {
                "S3DataDistributionType": "FullyReplicated",
                "S3DataType": "S3Prefix",
                "S3Uri": prefix,
            }
        }
    }
    assert actual.config == expected
Example #12
def test_s3_input_all_defaults():
    prefix = 'pre'
    actual = s3_input(s3_data=prefix)
    expected = \
        {'DataSource': {
            'S3DataSource': {
                'S3DataDistributionType': 'FullyReplicated',
                'S3DataType': 'S3Prefix',
                'S3Uri': prefix
            }
        }
        }
    assert actual.config == expected
def test_s3_input_all_defaults():
    prefix = 'pre'
    actual = s3_input(s3_data=prefix)
    expected = {
        'DataSource': {
            'S3DataSource': {
                'S3DataDistributionType': 'FullyReplicated',
                'S3DataType': 'S3Prefix',
                'S3Uri': prefix
            }
        }
    }
    assert actual.config == expected
Example #14
def sagemaker_predict(population, plpData, bucket, prefix, container, role_arn,
                      model_name):
    s3 = boto3.resource('s3')
    sess = sagemaker.Session()
    print("Applying Python Model")
    ###########################################################################
    print("Loading Data...")
    # load data + train,test indexes + validation index
    X = plpData[population[:, 0], :]

    # load index file
    print("population loaded- %s rows and %s columns" %
          (np.shape(population)[0], np.shape(population)[1]))
    print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1]))
    print("Data ready for model has %s features" % (np.shape(X)[1]))
    # load model
    print("loading model...")
    model_url = 's3://{}/plpModel/model.tar.gz'.format(bucket)

    #role = sagemaker.get_execution_role()
    role = role_arn
    container = container
    sm_client = boto3.client('sagemaker')

    primary_container = {'Image': container, 'ModelDataUrl': model_url}

    create_model_response = sm_client.create_model(
        ModelName=model_name,
        ExecutionRoleArn=role,
        PrimaryContainer=primary_container)

    print(X.shape)
    print("Calculating predictions on population...")
    np.savetxt('pred.csv', X.todense(), delimiter=',', fmt='%i')

    pred_s3 = sess.upload_data('pred.csv', bucket=bucket, key_prefix=prefix)
    pred_s3 = sagemaker.s3_input(s3_data=pred_s3, content_type='text/csv')

    transformer = sagemaker.transformer.Transformer(
        base_transform_job_name='Batch-Transform',
        model_name=model_name,
        instance_count=1,
        instance_type='ml.m4.xlarge',
        output_path='s3://{}/{}'.format(bucket, prefix))
    transformer.transform('s3://{}/{}/pred.csv'.format(bucket, prefix),
                          content_type='text/csv',
                          split_type='Line')
    transformer.wait()

    return
Example #15
def generate_NN_predictor(ticker, bucket, prefix, role, sagemaker_session):
    s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/data/{}/train.csv'\
                                        .format(bucket, prefix, ticker), content_type='text/csv')
    s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/data/{}/validation.csv'\
                                             .format(bucket, prefix, ticker), content_type='text/csv')
    estimator = PyTorch(
        entry_point='train.py',
        source_dir='pytorch',  # this should be just "source" for your code
        role=role,
        framework_version='1.0',
        train_instance_count=1,
        train_instance_type='ml.c4.xlarge',
        sagemaker_session=sagemaker_session,
        hyperparameters={
            'input_dim': 26,  # num of features
            'hidden_dim': 260,
            'output_dim': 1,
            'epochs': 200  # could change to higher
        })
    estimator.fit({'train': s3_input_train, 'validation': s3_input_validation})
    predictor = estimator.deploy(initial_instance_count=1,
                                 instance_type="ml.m4.xlarge")
    return predictor
Example #16
def get_training_params(model_name, job_id, role, image_uri, training_uri,
                        validation_uri, output_uri, hyperparameters):
    # Create the estimator
    xgb = sagemaker.estimator.Estimator(image_uri,
                                        role,
                                        train_instance_count=1,
                                        train_instance_type='ml.m4.xlarge',
                                        output_path=output_uri)
    # Set the default hyperparameters, overriding them with any provided values
    params = {
        'max_depth': '9',
        'eta': '0.2',
        'gamma': '4',
        'min_child_weight': '300',
        'subsample': '0.8',
        'objective': 'reg:linear',
        'early_stopping_rounds': '10',
        'num_round': '100'
    }
    xgb.set_hyperparameters(**{**params, **hyperparameters})

    # Specify the data source
    s3_input_train = sagemaker.s3_input(s3_data=training_uri,
                                        content_type='csv')
    s3_input_val = sagemaker.s3_input(s3_data=validation_uri,
                                      content_type='csv')
    data = {'train': s3_input_train, 'validation': s3_input_val}

    # Get the training request
    request = training_config(xgb, inputs=data, job_name=job_id)
    return {
        "Parameters": {
            "ModelName": model_name,
            "TrainJobId": job_id,
            "TrainJobRequest": json.dumps(request),
        }
    }
Example #17
def test_s3_input_all_defaults(caplog):
    prefix = "pre"
    actual = s3_input(s3_data=prefix)
    expected = {
        "DataSource": {
            "S3DataSource": {
                "S3DataDistributionType": "FullyReplicated",
                "S3DataType": "S3Prefix",
                "S3Uri": prefix,
            }
        }
    }
    assert actual.config == expected

    warning_message = (
        "'s3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2."
    )
    assert warning_message in caplog.text
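
For context, the rename mentioned in that warning points at sagemaker.inputs.TrainingInput; a minimal sketch of the equivalent v2 call, with an illustrative S3 URI, might look like:

from sagemaker.inputs import TrainingInput

# TrainingInput is the SDK v2 replacement for s3_input; the URI below is an assumption
train_input = TrainingInput(s3_data='s3://my-bucket/train', content_type='csv')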
def test_s3_input_all_arguments():
    prefix = 'pre'
    distribution = 'FullyReplicated'
    compression = 'Gzip'
    content_type = 'text/csv'
    record_wrapping = 'RecordIO'
    s3_data_type = 'Manifestfile'
    result = s3_input(s3_data=prefix, distribution=distribution, compression=compression,
                      content_type=content_type, record_wrapping=record_wrapping, s3_data_type=s3_data_type)
    expected = \
        {'DataSource': {
            'S3DataSource': {
                'S3DataDistributionType': distribution,
                'S3DataType': s3_data_type,
                'S3Uri': prefix,
            }
        },
            'CompressionType': compression,
            'ContentType': content_type,
            'RecordWrapperType': record_wrapping
        }

    assert result.config == expected
Example #19
    'algorithms_lab/xgboost_validation/validation.csv').upload_file(
        'validation.csv')

# # Step 3: Creating and training our model (XGBoost)
# This is where the magic happens. We will get the container for the XGBoost algorithm, which is hosted in Amazon ECR.

from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(boto3.Session().region_name, 'xgboost')

#
# Next, because we're training with the CSV file format, we'll create inputs that our training function can use as a pointer to the files in S3, which also specify that the content type is CSV.

s3_input_train = sagemaker.s3_input(
    s3_data='s3://{}/algorithms_lab/xgboost_train'.format(bucket),
    content_type='csv')
s3_input_validation = sagemaker.s3_input(
    s3_data='s3://{}/algorithms_lab/xgboost_validation'.format(bucket),
    content_type='csv')

#
# Next we start building out our model by using the SageMaker Python SDK and passing in everything that is required to create an XGBoost model.
#
# First I like to always create a specific job name.
#
# Next, we'll need to specify training parameters (a minimal sketch follows the list):
#
# * The xgboost algorithm container
# * The IAM role to use
# * Training instance type and count
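#
# A minimal sketch of that estimator setup, under stated assumptions: the job-name format is illustrative, and `role` and `bucket` are assumed to be defined in earlier cells of the notebook (they are not shown in this snippet).

from time import gmtime, strftime

# A specific, timestamped job name (illustrative format)
job_name = 'xgboost-' + strftime('%Y-%m-%d-%H-%M-%S', gmtime())

# Estimator built from the XGBoost container, IAM role, and instance type/count listed above
xgb = sagemaker.estimator.Estimator(container,
                                    role,
                                    train_instance_count=1,
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/algorithms_lab/xgboost_output'.format(bucket),
                                    sagemaker_session=sagemaker.Session())
# job_name can later be passed to xgb.fit(..., job_name=job_name)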
Example #20
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='reg:linear',
                        early_stopping_rounds=10,
                        num_round=200)

# %%
"""
Now that we have our estimator object completely set up, it is time to train it. To do this we make sure that SageMaker knows our input data is in csv format and then execute the `fit` method.
"""

# %%
# This is a wrapper around the location of our train and validation data, to make sure that SageMaker
# knows our data is in csv format.
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')

xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

# %%
"""
## Step 5: Test the model

Now that we have fit our model to the training data, using the validation data to avoid overfitting, we can test our model. To do this we will make use of SageMaker's Batch Transform functionality. To start with, we need to build a transformer object from our fit model.
"""

# %%
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
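
# A hedged sketch of applying the transformer to the test data; `test_location`
# (an S3 URI) and the CSV split settings are assumptions, not from the original notebook.
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')
xgb_transformer.wait()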

# %%
from sagemaker.predictor import csv_serializer


data = pd.read_csv('amazon_locker_dataset.csv', sep=',', encoding='latin1')

data=data.iloc[:,1:]
data = pd.concat([data['QoS(S)'], data.drop(['QoS(S)'], axis=1)], axis=1) 
data=np.array(data.iloc[:,:]).astype('float32')
np.savetxt("train.csv",data, delimiter=",")

containers = {'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest'}

role = get_execution_role()

sess = sagemaker.Session()
bucket = "model-artefacts-sagemaker"
prefix = "model2/test"

key = 'xgboost'
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_file('train.csv')
s3_train_data = sagemaker.s3_input(s3_data='s3://{}/{}/train/{}'.format(bucket, prefix, key), content_type='csv')
print(s3_train_data)

xgb = sagemaker.estimator.Estimator(containers[boto3.Session().region_name], role, train_instance_count=1, train_instance_type='ml.m4.xlarge', output_path='s3://{}/{}/output'.format(bucket, prefix),sagemaker_session=sess)

xgb.set_hyperparameters(eta=0.1, objective='reg:linear', num_round=25)

xgb.fit({'train': s3_train_data})

xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')
Example #22
# In[6]:

train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729),
                                 [int(0.7 * len(model_data))])
print(train_data.shape, test_data.shape)

# In[7]:

pd.concat([train_data['y_yes'],
           train_data.drop(['y_no', 'y_yes'], axis=1)],
          axis=1).to_csv('train.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(
    os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(
    bucket_name, prefix),
                                    content_type='csv')

# In[ ]:

# In[8]:

sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(containers[my_region],
                                    role,
                                    train_instance_count=1,
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(
                                        bucket_name, prefix),
                                    sagemaker_session=sess)
xgb.set_hyperparameters(max_depth=5,
Example #23
    'Extract, Transform, Load',
    parameters={"JobName": job_name,
                "Arguments":{
                    '--S3_SOURCE': data_source,
                    '--S3_DEST': 's3a://{}/{}/'.format(bucket, project_name),
                    '--TRAIN_KEY': train_prefix + '/',
                    '--VAL_KEY': val_prefix +'/'}
               }
)


training_step = steps.TrainingStep(
    'Model Training', 
    estimator=xgb,
    data={
        'train': s3_input(train_data, content_type='csv'),
        'validation': s3_input(validation_data, content_type='csv')
    },
    job_name=training_job_name,
    wait_for_completion=True
)

model_step = steps.ModelStep(
    'Save Model',
    model=training_step.get_expected_model(),
    model_name=execution_input['ModelName'],
    result_path='$.ModelStepResults'
)

lambda_step = steps.compute.LambdaStep(
    'Query Training Results',
Example #24
# ### Update the code cell

# In[25]:

# set the hyperparameters
xgb.set_hyperparameters(objective="binary:logistic", num_round=1)

# Since we're training with CSV file format, we'll create [`s3_input`](https://sagemaker.readthedocs.io/en/latest/session.html?highlight=sagemaker.session.s3_input) objects that our training function can use as a pointer to the file type and location in S3.
#
# Run the following for the training data input and the validation data sets:

# In[26]:

# Configuring the data inputs
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(
    bucket, prefix),
                                    content_type='csv')
s3_input_validation = sagemaker.s3_input(
    s3_data='s3://{}/{}/validation'.format(bucket, prefix), content_type='csv')

# Finally we are ready to train.
# To train use the [xgb.fit()](https://sagemaker.readthedocs.io/en/latest/estimators.html) function.

# In[27]:

# Train the model
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

# Pay attention to the final validation-error. A lower value is better.

# ---
Example #25
    'TrainLocation': str,
    'ValidationLocation': str,
    'EndpointName': str
})
execution_params = {
    'TrainLocation': input_train_path,
    'ValidationLocation': input_validation_path,
    'EndpointName': endpoint_name
}

training_step = steps.TrainingStep(
    'Train Step',
    estimator=xgb,
    data={
        'train':
        sagemaker.s3_input(execution_input['TrainLocation'],
                           content_type='libsvm'),
        'validation':
        sagemaker.s3_input(execution_input['ValidationLocation'],
                           content_type='libsvm')
    },
    job_name=job_name  # Require embedding this to job_name matches uploaded code
)

model_step = steps.ModelStep('Save model',
                             model=training_step.get_expected_model(),
                             model_name=job_name)

endpoint_config_step = steps.EndpointConfigStep("Create Endpoint Config",
                                                endpoint_config_name=job_name,
                                                model_name=job_name,
                                                initial_instance_count=1,
pd.concat([
    train_data['LV ActivePower (kW)'],
    train_data.drop(['LV ActivePower (kW)'], axis=1)
],
          axis=1).to_csv('Turbo_Train.csv', index=False, header=False)

pd.concat([
    train_data['LV ActivePower (kW)'],
    train_data.drop(['LV ActivePower (kW)'], axis=1)
],
          axis=1).to_csv('Turbo_Test.csv', index=False, header=False)

boto3.Session().resource('s3').Bucket(bucket_name).Object(
    os.path.join(prefix, 'Test/Turbo_Test.csv')).upload_file('Turbo_Test.csv')
s3_input_Test = sagemaker.s3_input(s3_data="s3://{}/{}/Test/Turbo_Test".format(
    bucket_name, prefix),
                                   content_type="csv")

containers = {
    'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
    'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
    'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
    'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'
}

sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(containers[my_region],
                                    role,
                                    train_instance_count=1,
                                    train_instance_type='ml.m5.large',
                                    output_path=output_path,
def train_sagemaker(population, plpData, classifier, hyperParameters,
                    container, bucket, s3_output, role_arn, prefix, job_name,
                    modelOutput):
    print("Training Sagemaker model ")
    s3 = boto3.resource('s3')
    sess = sagemaker.Session()

    y = population[:, 1]
    X = plpData[population[:, 0].astype(int), :]
    trainInds = population[:, population.shape[1] - 1] > 0

    print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1]))
    print("population loaded- %s rows and %s columns" %
          (np.shape(population)[0], np.shape(population)[1]))
    ###########################################################################

    np.savetxt('train.csv',
               scipy.sparse.hstack(
                   (y[trainInds][:, None], X[trainInds, :])).todense(),
               delimiter=',',
               fmt='%i')
    np.savetxt('test.csv', X[trainInds, :].todense(), delimiter=',', fmt='%i')

    train_s3 = sess.upload_data(path='train.csv',
                                bucket=bucket,
                                key_prefix=prefix)
    test_s3 = sess.upload_data(path='test.csv',
                               bucket=bucket,
                               key_prefix=prefix)

    train_s3 = sagemaker.s3_input(s3_data=train_s3, content_type='text/csv')
    test_s3 = sagemaker.s3_input(s3_data=test_s3, content_type='text/csv')

    estimator = sagemaker.estimator.Estimator(
        image_name=container,
        role=role_arn,
        train_instance_count=1,
        train_instance_type='ml.m5.large',
        train_volume_size=30,
        train_max_run=3600,
        input_mode='File',
        output_path=s3_output)
    if classifier == 'linear-learner':
        estimator.set_hyperparameters(feature_dim=X[trainInds, :].shape[1],
                                      predictor_type='binary_classifier',
                                      mini_batch_size=100)
    if classifier == 'xgboost':
        estimator.set_hyperparameters(num_round=10)
    if classifier == 'knn':
        k = 1000
        if hyperParameters is not None:
            if hyperParameters["k"] is not None:
                k = int(hyperParameters["k"])
        estimator.set_hyperparameters(feature_dim=X[trainInds, :].shape[1],
                                      predictor_type='classifier',
                                      k=k,
                                      sample_size=X[trainInds, :].shape[0])

    input_data = {"train": train_s3}
    estimator.fit(inputs=input_data, job_name=job_name)

    transformer = estimator.transformer(
        instance_count=1,
        instance_type='ml.m4.xlarge',
        strategy='MultiRecord',
        assemble_with='Line',
        output_path='s3://{}/prediction'.format(bucket))
    transformer.transform('s3://{}/data/test.csv'.format(bucket),
                          content_type='text/csv',
                          split_type='Line')
    transformer.wait()

    # save the model:
    if not os.path.exists(modelOutput):
        os.makedirs(modelOutput)
    print("Model saved to: %s" % (modelOutput))

    modelkey = os.path.join('output', job_name, 'output/model.tar.gz')
    s3.Bucket(bucket).download_file(modelkey,
                                    os.path.join(modelOutput, 'model.tar.gz'))

    return True
Example #28
testX['target'] = y_test

trainX.head()

# convert to CSV so SM can consume
trainX.to_csv('boston_train.csv')
testX.to_csv('boston_test.csv')
ntrain_csv = reformat_csv('boston_train.csv')
ntest_csv = reformat_csv('boston_test.csv')

# send data to S3. SageMaker will take training data from s3
trainpath = sess.upload_data(
    path=ntrain_csv, bucket=bucket,
    key_prefix='sagemaker/sklearncontainer')

s3_input_train = sagemaker.s3_input(s3_data=trainpath, content_type='csv')

testpath = sess.upload_data(
    path=ntest_csv, bucket=bucket,
    key_prefix='sagemaker/sklearncontainer')

s3_input_validation = sagemaker.s3_input(s3_data=testpath, content_type='csv')

container = get_image_uri(region, 'xgboost', '0.90-1')
"""

max_depth controls how deep each tree within the algorithm can be built. 
    Deeper trees can lead to better fit, but are more computationally expensive and can lead to overfitting. There is typically some trade-off in model performance that needs to be explored between a large number of shallow trees and a smaller number of deeper trees.
subsample controls sampling of the training data. 
    This technique can help reduce overfitting, but setting it too low can also starve the model of data.
num_round controls the number of boosting rounds.
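
A minimal sketch of setting the hyperparameters described above on an estimator built from `container` (hedged: `role`, `bucket`, `prefix`, and `sess` are assumed from the surrounding snippet, and the specific values are illustrative, not recommendations):

xgb = sagemaker.estimator.Estimator(container,
                                    role,
                                    train_instance_count=1,
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)
# max_depth limits tree depth, subsample controls row sampling, num_round sets boosting rounds
xgb.set_hyperparameters(max_depth=5,
                        subsample=0.8,
                        num_round=100)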