Example #1
def test_deserializer():
    array_data = [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]
    s = numpy_to_record_serializer()
    buf = s(np.array(array_data))
    d = record_deserializer()
    for record, expected in zip(d(buf, 'who cares'), array_data):
        assert record.features["values"].float64_tensor.values == expected
Example #2
 def __init__(self, endpoint, sagemaker_session=None):
     super(LDAPredictor, self).__init__(
         endpoint,
         sagemaker_session,
         serializer=numpy_to_record_serializer(),
         deserializer=record_deserializer(),
     )
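A predictor wired with this serializer/deserializer pair accepts NumPy arrays and returns parsed Record protobufs. Below is a minimal usage sketch; the endpoint name and input values are placeholders rather than anything from the original source.

import numpy as np

predictor = LDAPredictor("my-lda-endpoint")  # placeholder endpoint name
records = predictor.predict(np.array([[1.0, 0.0, 2.0, 5.0]]))
for record in records:
    # print whatever label tensors the endpoint returned, assuming float32 values
    for name, value in record.label.items():
        print(name, value.float32_tensor.values)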
Example #4
def test_serializer_accepts_one_dimensional_array():
    s = numpy_to_record_serializer()
    array_data = [1.0, 2.0, 3.0]
    buf = s(np.array(array_data))
    record_data = next(_read_recordio(buf))
    record = Record()
    record.ParseFromString(record_data)
    assert record.features["values"].float64_tensor.values == array_data
Example #5
def test_serializer():
    s = numpy_to_record_serializer()
    array_data = [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]
    buf = s(np.array(array_data))
    for record_data, expected in zip(_read_recordio(buf), array_data):
        record = Record()
        record.ParseFromString(record_data)
        assert record.features["values"].float64_tensor.values == expected
Example #6
def test_serializer():
    s = numpy_to_record_serializer()
    array_data = [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]
    buf = s(np.array(array_data))
    for record_data, expected in zip(_read_recordio(buf), array_data):
        record = Record()
        record.ParseFromString(record_data)
        assert record.features["values"].float64_tensor.values == expected
def test_serializer_accepts_one_dimensional_array():
    s = numpy_to_record_serializer()
    array_data = [1.0, 2.0, 3.0]
    buf = s(np.array(array_data))
    record_data = next(_read_recordio(buf))
    record = Record()
    record.ParseFromString(record_data)
    assert record.features["values"].float64_tensor.values == array_data
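The tests above rely on a `_read_recordio` helper that is not shown in this excerpt. Here is a minimal sketch of such a helper, assuming the standard SageMaker/MXNet RecordIO framing (a uint32 magic value 0xced7230a, a uint32 payload length, and a payload padded to a 4-byte boundary); the real helper in the test suite may differ.

import struct

_kmagic = 0xCED7230A  # RecordIO magic number used by the SageMaker protobuf format

def _read_recordio(f):
    # Yield every encoded record payload from a RecordIO stream (sketch only).
    while True:
        header = f.read(4)
        if len(header) < 4:
            return
        (magic,) = struct.unpack("<I", header)
        assert magic == _kmagic
        (length,) = struct.unpack("<I", f.read(4))
        yield f.read(length)
        pad = (4 - length % 4) % 4  # payloads are padded to a 4-byte boundary
        if pad:
            f.read(pad)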
Example #8
 def __init__(self, endpoint, sagemaker_session=None):
     """
     Args:
         endpoint:
         sagemaker_session:
     """
     super(LinearLearnerPredictor, self).__init__(
         endpoint,
         sagemaker_session,
         serializer=numpy_to_record_serializer(),
         deserializer=record_deserializer(),
     )
 def __init__(self, endpoint, sagemaker_session=None):
     """
     Args:
         endpoint:
         sagemaker_session:
     """
     super(FactorizationMachinesPredictor, self).__init__(
         endpoint,
         sagemaker_session,
         serializer=numpy_to_record_serializer(),
         deserializer=record_deserializer(),
     )
def predict():
    """
    Provide this endpoint an image in jpeg format.
    The image should be equal in size to the training images (28x28).
    """
    img = Image.open(BytesIO(app.current_request.raw_body)).convert("L")
    img_arr = np.array(img, dtype=np.float32)
    runtime = boto3.Session().client(service_name="sagemaker-runtime",
                                     region_name="eu-west-1")
    response = runtime.invoke_endpoint(
        EndpointName="mnistclassifier",
        ContentType="application/x-recordio-protobuf",
        Body=numpy_to_record_serializer()(img_arr.flatten()),
    )
    result = json.loads(response["Body"].read().decode("utf-8"))
    return Response(result,
                    status_code=200,
                    headers={"Content-Type": "application/json"})
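The handler above sends a protobuf request body but reads the response as JSON. As a hedged alternative sketch, the endpoint could be asked for a protobuf response and the body parsed with the same record_deserializer used by the Predictor classes; this assumes the endpoint honors a protobuf Accept header and reuses the runtime client and img_arr from the handler above.

from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer

response = runtime.invoke_endpoint(
    EndpointName="mnistclassifier",
    ContentType="application/x-recordio-protobuf",
    Accept="application/x-recordio-protobuf",
    Body=numpy_to_record_serializer()(img_arr.flatten()),
)
# record_deserializer parses the RecordIO-protobuf stream into Record messages
records = record_deserializer()(response["Body"], response["ContentType"])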
Example #11
def convert_and_upload_training_data(ndarray,
                                     bucket,
                                     prefix,
                                     filename='data.pbr'):
    import boto3
    import os
    import numpy as np
    from sagemaker.amazon.common import numpy_to_record_serializer

    # convert the NumPy array to Protobuf RecordIO format
    # (shingle() is a windowing helper assumed to be defined elsewhere in the source)
    serializer = numpy_to_record_serializer()
    data = shingle(ndarray)
    buffer = serializer(data.astype(np.int32))

    # upload to S3
    s3_object = os.path.join(prefix, 'train', filename)
    boto3.Session().resource('s3').Bucket(bucket).Object(
        s3_object).upload_fileobj(buffer)
    s3_path = 's3://{}/{}'.format(bucket, s3_object)
    return s3_path
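An illustrative call, with a placeholder bucket, prefix, and synthetic input series (none of these names come from the original source); the returned S3 URI is what an Estimator's fit() expects for its 'train' channel.

import numpy as np

raw_series = np.random.randint(0, 100, size=1000)  # synthetic stand-in for the real data
s3_train_data = convert_and_upload_training_data(raw_series, 'my-bucket', 'rcf-demo')
# e.g. rcf.fit({'train': s3_train_data}) on a RandomCutForest estimator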
 def __init__(self, endpoint, sagemaker_session=None):
     super(LinearLearnerPredictor, self).__init__(endpoint, sagemaker_session,
                                                  serializer=numpy_to_record_serializer(),
                                                  deserializer=record_deserializer())
 def __init__(self, endpoint, sagemaker_session=None):
     super(FactorizationMachinesPredictor, self).__init__(endpoint,
                                                          sagemaker_session,
                                                          serializer=numpy_to_record_serializer(),
                                                          deserializer=record_deserializer())
import boto3
import pandas
import os
import json
import logging
import sys
from sagemaker.amazon.common import numpy_to_record_serializer
logger = logging.getLogger()
logger.setLevel(logging.INFO)

s3 = boto3.resource('s3')
client = boto3.client('s3')
serializer = numpy_to_record_serializer()


def toProto(event, context):
    try:
        logger.info(event)
        s3Record = event['Records'][0]['s3']
        bucket = s3Record['bucket']['name']
        key = s3Record['object']['key']

        logger.info('Getting original object')
        origin = s3.Object(bucket, key)
        logger.info('Object retrieved')
        logger.info(origin)

        logger.info('Getting tags')
        tagging = client.get_object_tagging(Bucket=bucket, Key=key)
        logger.info('Tags retrieved')
        logger.info(tagging['TagSet'])
Example #15
def main():
    """
    example call
        python3 train_lda.py \
            --pageInputDir pages \
            --vocabFile vocab.pkl \
            --s3Bucket alex9311-sagemaker \
            --s3Prefix LDA-testing \
            --awsRole aws-sagemaker-execution-role
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--vocabFile', action='store', required=True)
    parser.add_argument('--pageInputDir', action='store', required=True)
    parser.add_argument('--s3Bucket', action='store', required=True)
    parser.add_argument('--s3Prefix', action='store', required=True)
    parser.add_argument('--awsRole', action='store', required=True)
    args = parser.parse_args()

    bucket = args.s3Bucket
    prefix = args.s3Prefix
    role = args.awsRole

    vocab = pickle.load(open(args.vocabFile, 'rb'))
    documents = import_documents_on_disk(args.pageInputDir)
    for d in documents:
        d['term_counts'] = document_to_term_counts(d['tokens'], vocab)
    print('length of vocab: ', len(vocab))
    print('number of documents: ', len(documents))

    training_docs = np.array([d['term_counts'] for d in documents])

    # convert training_docs to Protobuf RecordIO format
    recordio_protobuf_serializer = numpy_to_record_serializer()
    training_docs_recordio = recordio_protobuf_serializer(training_docs)

    # upload to S3 in bucket/prefix/train
    fname = 'lda_training.data'
    s3_object = os.path.join(prefix, 'train', fname)
    boto3.Session().resource('s3').Bucket(bucket).Object(
        s3_object).upload_fileobj(training_docs_recordio)
    s3_train_data = 's3://{}/{}'.format(bucket, s3_object)
    print('Uploaded training data to S3: {}'.format(s3_train_data))

    region_name = boto3.Session().region_name
    container = get_image_uri(region_name, 'lda')

    print('Using SageMaker LDA container: {} ({})'.format(
        container, region_name))

    session = sagemaker.Session()

    print('Training input/output will be stored in {}/{}'.format(
        bucket, prefix))
    print('\nIAM Role: {}'.format(role))

    lda = sagemaker.estimator.Estimator(
        container,
        role,
        output_path='s3://{}/{}/output'.format(bucket, prefix),
        train_instance_count=1,
        train_instance_type='ml.m5.large',
        sagemaker_session=session,
    )

    # set algorithm-specific hyperparameters
    lda.set_hyperparameters(
        num_topics=10,
        feature_dim=len(vocab),
        mini_batch_size=len(documents),
        alpha0=1.0,
    )

    # run the training job on input data stored in S3
    lda.fit({'train': s3_train_data})

    training_job_name = lda.latest_training_job.job_name

    print('Training job name: {}'.format(training_job_name))

    model_fname = 'model.tar.gz'
    model_object = os.path.join(prefix, 'output', training_job_name, 'output',
                                model_fname)
    boto3.Session().resource('s3').Bucket(bucket).Object(
        model_object).download_file(model_fname)
    with tarfile.open(model_fname) as tar:
        tar.extractall()
    print('Downloaded and extracted model tarball: {}'.format(model_object))

    # obtain the model file
    model_list = [
        fname for fname in os.listdir('.') if fname.startswith('model_')
    ]
    model_fname = model_list[0]
    print('Found model file: {}'.format(model_fname))

    # get the model from the model file and store in Numpy arrays
    alpha, beta = mx.ndarray.load(model_fname)
    learned_alpha_permuted = alpha.asnumpy()
    learned_beta_permuted = beta.asnumpy()

    topic_distributions = learned_beta_permuted.tolist()

    topic_word_weights_list = []
    for topic_distribution in topic_distributions:
        this_topic_word_weights = {}
        for word_index, weight in enumerate(topic_distribution):
            this_topic_word_weights[vocab[word_index]] = weight
        topic_word_weights_list.append(this_topic_word_weights)

    top_words_in_topics = []
    for topic_word_weights in topic_word_weights_list:
        top_words_in_topics.append(
            sorted(topic_word_weights,
                   key=topic_word_weights.get,
                   reverse=True)[:10])
    for index, top_words_in_topic in enumerate(top_words_in_topics):
        print('topic', index)
        for word in top_words_in_topic:
            print('\t', word, ':', topic_word_weights_list[index][word])
 def __init__(self, endpoint, sagemaker_session=None):
     super(RandomCutForestPredictor, self).__init__(endpoint, sagemaker_session,
                                                    serializer=numpy_to_record_serializer(),
                                                    deserializer=record_deserializer())