def test_ntm_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
    """Build an Airflow training config for an NTM estimator and check its input URI.

    The S3 URI of the first input channel in the generated config is handed to
    ``_assert_that_s3_url_contains_data`` together with the session.
    """
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        ntm_dir = os.path.join(DATA_DIR, "ntm")
        train_file = os.path.join(ntm_dir, "nips-train_1.pbr")

        with open(train_file, "rb") as stream:
            parsed = read_records(stream)

        # The first record's tensor shape is used as the feature count for
        # the whole record set.
        first_tensor = parsed[0].features["values"].float32_tensor
        feature_num = int(first_tensor.shape[0])

        estimator = NTM(
            role=ROLE,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type=cpu_instance_type,
            num_topics=10,
            sagemaker_session=sagemaker_session,
        )

        record_set = prepare_record_set_from_local_files(
            ntm_dir, estimator.data_location, len(parsed), feature_num, sagemaker_session
        )

        workflow_config = _build_airflow_workflow(
            estimator=estimator, instance_type=cpu_instance_type, inputs=record_set
        )

        first_channel = workflow_config["InputDataConfig"][0]
        s3_uri = first_channel["DataSource"]["S3DataSource"]["S3Uri"]
        _assert_that_s3_url_contains_data(sagemaker_session, s3_uri)
def test_ntm(sagemaker_session):
    """Train NTM on the local ``ntm`` data, deploy it, and predict on one random row.

    The single returned record must carry a non-``None`` ``topic_weights`` label.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        ntm_dir = os.path.join(DATA_DIR, 'ntm')
        train_file = os.path.join(ntm_dir, 'nips-train_1.pbr')

        with open(train_file, 'rb') as stream:
            parsed = read_records(stream)

        # The first record's tensor shape is used as the feature count for
        # the whole record set.
        first_tensor = parsed[0].features['values'].float32_tensor
        feature_num = int(first_tensor.shape[0])

        estimator = NTM(
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            num_topics=10,
            sagemaker_session=sagemaker_session,
            base_job_name='test-ntm',
        )

        training_set = prepare_record_set_from_local_files(
            ntm_dir, estimator.data_location, len(parsed), feature_num, sagemaker_session
        )
        estimator.fit(training_set, None)

    endpoint_name = unique_name_from_base('ntm')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        ntm_model = NTMModel(
            estimator.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session
        )
        predictor = ntm_model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        sample = np.random.rand(1, feature_num)
        predictions = predictor.predict(sample)

        assert len(predictions) == 1
        for prediction in predictions:
            assert prediction.label["topic_weights"] is not None
def test_lda(sagemaker_session, cpu_instance_type):
    """Train LDA on the local ``lda`` data, deploy it, and predict on one random row.

    One generated name is used for both the training job and the endpoint.
    The single returned record must carry a non-``None`` ``topic_mixture`` label.
    """
    job_name = unique_name_from_base("lda")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        lda_dir = os.path.join(DATA_DIR, "lda")
        train_file = os.path.join(lda_dir, "nips-train_1.pbr")

        with open(train_file, "rb") as stream:
            parsed = read_records(stream)

        # The first record's tensor shape is used as the feature count for
        # the whole record set.
        first_tensor = parsed[0].features["values"].float32_tensor
        feature_num = int(first_tensor.shape[0])

        estimator = LDA(
            role="SageMakerRole",
            instance_type=cpu_instance_type,
            num_topics=10,
            sagemaker_session=sagemaker_session,
        )

        training_set = prepare_record_set_from_local_files(
            lda_dir, estimator.data_location, len(parsed), feature_num, sagemaker_session
        )
        estimator.fit(records=training_set, mini_batch_size=100, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        lda_model = LDAModel(
            estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        predictor = lda_model.deploy(1, cpu_instance_type, endpoint_name=job_name)

        sample = np.random.rand(1, feature_num)
        predictions = predictor.predict(sample)

        assert len(predictions) == 1
        for prediction in predictions:
            assert prediction.label["topic_mixture"] is not None
def test_lda(sagemaker_session):
    """Train LDA on the local ``lda`` data, deploy it, and predict on one random row.

    Training runs under a 15-minute timeout. The single returned record must
    carry a non-``None`` ``topic_mixture`` label.
    """
    with timeout(minutes=15):
        lda_dir = os.path.join(DATA_DIR, 'lda')
        train_file = os.path.join(lda_dir, 'nips-train_1.pbr')

        with open(train_file, 'rb') as stream:
            parsed = read_records(stream)

        # The first record's tensor shape is used as the feature count for
        # the whole record set.
        first_tensor = parsed[0].features['values'].float32_tensor
        feature_num = int(first_tensor.shape[0])

        estimator = LDA(
            role='SageMakerRole',
            train_instance_type='ml.c4.xlarge',
            num_topics=10,
            sagemaker_session=sagemaker_session,
            base_job_name='test-lda',
        )

        training_set = prepare_record_set_from_local_files(
            lda_dir, estimator.data_location, len(parsed), feature_num, sagemaker_session
        )
        # Second positional argument is the mini-batch size.
        estimator.fit(training_set, 100)

    endpoint_name = name_from_base('lda')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        lda_model = LDAModel(
            estimator.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session
        )
        predictor = lda_model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        sample = np.random.rand(1, feature_num)
        predictions = predictor.predict(sample)

        assert len(predictions) == 1
        for prediction in predictions:
            assert prediction.label["topic_mixture"] is not None
def test_ntm(sagemaker_session):
    """Train NTM on the local ``ntm`` data, deploy it, and predict on one random row.

    The endpoint name comes from ``name_from_base('ntm')``. The single returned
    record must carry a non-``None`` ``topic_weights`` label.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        ntm_dir = os.path.join(DATA_DIR, 'ntm')
        train_file = os.path.join(ntm_dir, 'nips-train_1.pbr')

        with open(train_file, 'rb') as stream:
            parsed = read_records(stream)

        # The first record's tensor shape is used as the feature count for
        # the whole record set.
        first_tensor = parsed[0].features['values'].float32_tensor
        feature_num = int(first_tensor.shape[0])

        estimator = NTM(
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            num_topics=10,
            sagemaker_session=sagemaker_session,
            base_job_name='test-ntm',
        )

        training_set = prepare_record_set_from_local_files(
            ntm_dir, estimator.data_location, len(parsed), feature_num, sagemaker_session
        )
        estimator.fit(training_set, None)

    endpoint_name = name_from_base('ntm')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        ntm_model = NTMModel(
            estimator.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session
        )
        predictor = ntm_model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        sample = np.random.rand(1, feature_num)
        predictions = predictor.predict(sample)

        assert len(predictions) == 1
        for prediction in predictions:
            assert prediction.label["topic_weights"] is not None
def test_ntm_serverless_inference(sagemaker_session, cpu_instance_type):
    """Train NTM, deploy it with a default ``ServerlessInferenceConfig``, and check the result type.

    One generated name is used for both the training job and the endpoint.
    The object returned by ``deploy`` must be a ``Predictor``.
    """
    job_name = unique_name_from_base("ntm-serverless")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        ntm_dir = os.path.join(DATA_DIR, "ntm")
        train_file = os.path.join(ntm_dir, "nips-train_1.pbr")

        with open(train_file, "rb") as stream:
            parsed = read_records(stream)

        # The first record's tensor shape is used as the feature count for
        # the whole record set.
        first_tensor = parsed[0].features["values"].float32_tensor
        feature_num = int(first_tensor.shape[0])

        estimator = NTM(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            num_topics=10,
            sagemaker_session=sagemaker_session,
        )

        training_set = prepare_record_set_from_local_files(
            ntm_dir, estimator.data_location, len(parsed), feature_num, sagemaker_session
        )
        estimator.fit(records=training_set, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        ntm_model = NTMModel(
            estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        serverless_config = ServerlessInferenceConfig()
        predictor = ntm_model.deploy(
            serverless_inference_config=serverless_config, endpoint_name=job_name
        )
        assert isinstance(predictor, Predictor)
def test_tuning_lda(sagemaker_session):
    """Run a two-job hyperparameter tuning of LDA, deploy via the tuner, and predict.

    Tuning maximizes ``test:pwll`` over ``alpha0`` and ``num_topics``. The
    endpoint context is keyed by the best training job. The single returned
    record must carry a non-``None`` ``topic_mixture`` label.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        lda_dir = os.path.join(DATA_DIR, 'lda')
        train_file = os.path.join(lda_dir, 'nips-train_1.pbr')

        with open(train_file, 'rb') as stream:
            parsed = read_records(stream)

        # The first record's tensor shape is used as the feature count for
        # the whole record set.
        first_tensor = parsed[0].features['values'].float32_tensor
        feature_num = int(first_tensor.shape[0])

        estimator = LDA(
            role='SageMakerRole',
            train_instance_type='ml.c4.xlarge',
            num_topics=10,
            sagemaker_session=sagemaker_session,
            base_job_name='test-lda',
        )

        # The same local files are prepared twice; the second set is
        # relabelled as the 'test' channel.
        train_set = prepare_record_set_from_local_files(
            lda_dir, estimator.data_location, len(parsed), feature_num, sagemaker_session
        )
        test_set = prepare_record_set_from_local_files(
            lda_dir, estimator.data_location, len(parsed), feature_num, sagemaker_session
        )
        test_set.channel = 'test'

        # Hyperparameters to search over.
        search_space = {
            'alpha0': ContinuousParameter(1, 10),
            'num_topics': IntegerParameter(1, 2),
        }
        metric = 'test:pwll'

        tuner = HyperparameterTuner(
            estimator=estimator,
            objective_metric_name=metric,
            hyperparameter_ranges=search_space,
            objective_type='Maximize',
            max_jobs=2,
            max_parallel_jobs=2,
        )

        tuner.fit([train_set, test_set], mini_batch_size=1)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        sample = np.random.rand(1, feature_num)
        predictions = predictor.predict(sample)

        assert len(predictions) == 1
        for prediction in predictions:
            assert prediction.label['topic_mixture'] is not None
def test_tuning_lda(sagemaker_session):
    """Tune LDA with two training jobs, deploy through the tuner, and predict once.

    The objective is to maximize ``test:pwll``; ``alpha0`` and ``num_topics``
    are the tuned hyperparameters. The single returned record must carry a
    non-``None`` ``topic_mixture`` label.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        source_dir = os.path.join(DATA_DIR, 'lda')
        source_file = os.path.join(source_dir, 'nips-train_1.pbr')

        with open(source_file, 'rb') as handle:
            loaded = read_records(handle)

        # Feature count is taken from the first record's tensor shape.
        leading_tensor = loaded[0].features['values'].float32_tensor
        feature_num = int(leading_tensor.shape[0])

        lda_estimator = LDA(
            role='SageMakerRole',
            train_instance_type='ml.c4.xlarge',
            num_topics=10,
            sagemaker_session=sagemaker_session,
            base_job_name='test-lda',
        )

        # Two record sets from the same files: one left on its default
        # channel, one reassigned to the 'test' channel.
        default_channel_set = prepare_record_set_from_local_files(
            source_dir, lda_estimator.data_location, len(loaded), feature_num,
            sagemaker_session,
        )
        test_channel_set = prepare_record_set_from_local_files(
            source_dir, lda_estimator.data_location, len(loaded), feature_num,
            sagemaker_session,
        )
        test_channel_set.channel = 'test'

        # Hyperparameters to optimize over.
        ranges = {
            'alpha0': ContinuousParameter(1, 10),
            'num_topics': IntegerParameter(1, 2),
        }
        objective = 'test:pwll'

        tuner = HyperparameterTuner(
            estimator=lda_estimator,
            objective_metric_name=objective,
            hyperparameter_ranges=ranges,
            objective_type='Maximize',
            max_jobs=2,
            max_parallel_jobs=2,
        )

        tuner.fit([default_channel_set, test_channel_set], mini_batch_size=1)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        random_row = np.random.rand(1, feature_num)
        outputs = predictor.predict(random_row)

        assert len(outputs) == 1
        for output in outputs:
            assert output.label['topic_mixture'] is not None
def test_tuning_lda(sagemaker_session, cpu_instance_type):
    """Tune LDA with early stopping, re-attach to the tuning job, then deploy and predict.

    After fitting under an explicit job name, a second tuner is attached by
    that name and its early-stopping type and estimator hyperparameters are
    checked. The endpoint context is keyed by the attached tuner's best
    training job, while deployment goes through the original tuner. The single
    returned record must carry a non-``None`` ``topic_mixture`` label.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        lda_dir = os.path.join(DATA_DIR, "lda")
        train_file = os.path.join(lda_dir, "nips-train_1.pbr")

        with open(train_file, "rb") as stream:
            parsed = read_records(stream)

        # The first record's tensor shape is used as the feature count for
        # the whole record set.
        first_tensor = parsed[0].features["values"].float32_tensor
        feature_num = int(first_tensor.shape[0])

        estimator = LDA(
            role="SageMakerRole",
            instance_type=cpu_instance_type,
            num_topics=10,
            sagemaker_session=sagemaker_session,
        )

        # The same local files are prepared twice; the second set is
        # relabelled as the "test" channel.
        train_set = prepare_record_set_from_local_files(
            lda_dir, estimator.data_location, len(parsed), feature_num, sagemaker_session
        )
        test_set = prepare_record_set_from_local_files(
            lda_dir, estimator.data_location, len(parsed), feature_num, sagemaker_session
        )
        test_set.channel = "test"

        # Hyperparameters to search over.
        search_space = {
            "alpha0": ContinuousParameter(1, 10),
            "num_topics": IntegerParameter(1, 2),
        }
        metric = "test:pwll"

        tuner = HyperparameterTuner(
            estimator=estimator,
            objective_metric_name=metric,
            hyperparameter_ranges=search_space,
            objective_type="Maximize",
            max_jobs=2,
            max_parallel_jobs=2,
            early_stopping_type="Auto",
        )

        tuning_job_name = unique_name_from_base("test-lda", max_length=32)
        print("Started hyperparameter tuning job with name:" + tuning_job_name)
        tuner.fit([train_set, test_set], mini_batch_size=1, job_name=tuning_job_name)

    attached_tuner = HyperparameterTuner.attach(
        tuning_job_name, sagemaker_session=sagemaker_session
    )
    assert attached_tuner.early_stopping_type == "Auto"

    attached_estimator = attached_tuner.estimator
    assert attached_estimator.alpha0 == 1.0
    assert attached_estimator.num_topics == 1

    best_training_job = attached_tuner.best_training_job()

    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, cpu_instance_type)

        sample = np.random.rand(1, feature_num)
        predictions = predictor.predict(sample)

        assert len(predictions) == 1
        for prediction in predictions:
            assert prediction.label["topic_mixture"] is not None
def transformation():
    """Do an inference on a single batch of data.

    The request body is read as either JSON or RecordIO-protobuf, converted
    to a sparse CSR matrix, scored with ``ScoringService.predict`` and the
    predictions are returned in the same format the request used.

    JSON input format is:
        '{"instances": [{"keys": ["User","1","2"], "values": ["a","b","c"]},
                        {"keys": ["User","5","6"], "values": ["d","e","f"]}]}'

    Each key/value pair is joined as ``"<key>_<value>"`` and looked up in the
    headers returned by ``ScoringService.get_headers()`` to find its column.

    Returns:
        A ``flask.Response`` with status 200 whose mimetype echoes the
        request content type, or a plain-text 415 response when the content
        type is neither ``application/json`` nor
        ``application/x-recordio-protobuf``.
    """
    # Coordinate-format triplets (row, column, value) for the sparse matrix;
    # te_idx counts the rows (one per instance / record).
    te_row_ind = []
    te_col_ind = []
    te_data = []
    te_idx = 0

    headers = ScoringService.get_headers()
    content_type = flask.request.content_type

    if content_type == 'application/json':
        print("Working with JSON input")
        inputs = json.loads(flask.request.data.decode('utf-8'))
        for instance in inputs['instances']:
            # The column index has to be found from the headers.
            for key, val in zip(instance['keys'], instance['values']):
                item_to_find = "{0}_{1}".format(key, val)
                try:
                    col = headers.index(item_to_find)
                except ValueError:
                    # Unknown feature: keep the original placeholder entry
                    # (an explicit 0.0 stored in column 1) and report it.
                    te_col_ind.append(1)
                    te_data.append(0.0)
                    te_row_ind.append(te_idx)
                    print("Couldn't find header for {0}".format(item_to_find))
                else:
                    te_col_ind.append(col)
                    te_data.append(1.0)
                    te_row_ind.append(te_idx)
            te_idx = te_idx + 1
    elif content_type == 'application/x-recordio-protobuf':
        print("Working with Protobuf input")
        # The payload is binary, so it is wrapped in a bytes buffer.
        test_records = smac.read_records(io.BytesIO(flask.request.data))
        for test_record in test_records:
            tensor = test_record.features['values'].float32_tensor
            te_row_ind.extend([te_idx] * len(tensor.values))
            te_col_ind.extend(tensor.keys)
            te_data.extend(tensor.values)
            te_idx = te_idx + 1
    else:
        return flask.Response(response='This predictor only supports JSON or Protobuf data',
                              status=415, mimetype='text/plain')

    # The shape is given explicitly so rows without entries are still counted.
    X_te_sparse = sp.csr_matrix(
        (np.array(te_data), (np.array(te_row_ind), np.array(te_col_ind))),
        shape=(te_idx, ScoringService.get_num_features())
    )

    print('Invoked with {} records'.format(X_te_sparse.shape))

    # Do the prediction
    predictions = ScoringService.predict(X_te_sparse)

    # Convert the predictions back to the format the request came in.
    if content_type == 'application/json':
        js = {'predictions': [{'score': str(pred_value)} for pred_value in predictions]}
        result = json.dumps(js)
    else:
        # One RecordIO-framed protobuf record per prediction; a single Record
        # object is cleared and reused for each one.
        buf = io.BytesIO()
        record = Record()
        for pred_value in predictions:
            record.Clear()
            record.label["score"].float32_tensor.values.extend([pred_value])
            smac._write_recordio(buf, record.SerializeToString())
        buf.seek(0)
        result = buf.getvalue()

    return flask.Response(response=result, status=200, mimetype=content_type)