def test_invalid_sparse_label():
    array_data = [[1, 2, 3], [10, 20, 3]]
    array = coo_matrix(np.array(array_data))
    label_data = np.array([99, 98, 97, 1000]).astype(np.dtype('float64'))
    with tempfile.TemporaryFile() as f:
        with pytest.raises(ValueError):
            write_spmatrix_to_sparse_tensor(f, array, label_data)
def test_dense_to_sparse():
    array_data = [[1, 2, 3], [10, 20, 3]]
    array = np.array(array_data)
    label_data = np.array([99, 98, 97]).astype(np.dtype("float64"))
    with tempfile.TemporaryFile() as f:
        with pytest.raises(TypeError):
            write_spmatrix_to_sparse_tensor(f, array, label_data)
Example #4
def recordize(matrices, dest_bucket, prefix, fname_template='data_part{}.pbr', parts=2):
    """
    Converts a sparse array to RecordIO format and uploads
    it to S3.
    """

    s3 = boto3.client('s3')
    chunk_size = matrices.shape[0] // parts

    print("Uploading data to S3:")
    for i in range(parts):
        buffer = io.BytesIO()
        start = i * chunk_size
        end = (i + 1) * chunk_size
        if i + 1 == parts:
            end = matrices.shape[0]

        # Convert this chunk of the sparse matrix to RecordIO protobuf
        smac.write_spmatrix_to_sparse_tensor(array=matrices[start:end], file=buffer, labels=None)
        buffer.seek(0)

        key = f"{prefix}/{fname_template.format(i)}"
        s3.upload_fileobj(buffer, Bucket=dest_bucket, Key=key)
        print(f"    s3://{dest_bucket}/{key}")
def split_convert_upload(sparray,
                         bucket,
                         prefix,
                         fname_template='data_part{}.pbr',
                         n_parts=2):
    import io
    import os

    import boto3
    import sagemaker.amazon.common as smac

    chunk_size = sparray.shape[0] // n_parts
    for i in range(n_parts):

        # Calculate start and end indices
        start = i * chunk_size
        end = (i + 1) * chunk_size
        if i + 1 == n_parts:
            end = sparray.shape[0]

        # Convert to record protobuf
        buf = io.BytesIO()
        smac.write_spmatrix_to_sparse_tensor(array=sparray[start:end],
                                             file=buf,
                                             labels=None)
        buf.seek(0)

        # Upload to s3 location specified by bucket and prefix
        fname = os.path.join(prefix, fname_template.format(i))
        boto3.resource('s3').Bucket(bucket).Object(fname).upload_fileobj(buf)
        print('Uploaded data to s3://{}'.format(os.path.join(bucket, fname)))
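Once the parts are uploaded, the prefix can be handed to a SageMaker estimator as a single training channel. A minimal sketch, assuming SageMaker Python SDK v2 and a hypothetical bucket/prefix (the estimator itself is not shown):

from sagemaker.inputs import TrainingInput

# All data_part*.pbr objects uploaded under the prefix form one training channel.
train_input = TrainingInput(
    s3_data="s3://my-example-bucket/fm/train/",   # hypothetical bucket/prefix
    content_type="application/x-recordio-protobuf",
)
# estimator.fit({"train": train_input})  # estimator construction omitted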
Example #7
def save_as_protobuf(X, Y, bucket, key):
    """Converts features and predictions matrices to recordio protobuf and
       writes to S3

    Args:
        X:
          2D numpy matrix with features
        Y:
          1D numpy matrix with predictions
        bucket:
          s3 bucket where recordio protobuf file will be staged
        prefix:
          s3 url prefix to stage prepared data to use for training the model
        key:
          protobuf file name to be staged

    Returns:
        s3 url with key to the protobuf data
    """
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    obj = '{}'.format(key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket, obj)
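A minimal usage sketch for `save_as_protobuf` (bucket and key are hypothetical; `write_spmatrix_to_sparse_tensor` requires X to already be a SciPy sparse matrix, as the tests above demonstrate):

import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.array([[0.0, 1.5, 0.0], [2.0, 0.0, 3.0]], dtype="float32"))
Y = np.array([0.0, 1.0], dtype="float32")

url = save_as_protobuf(X, Y, bucket="my-example-bucket", key="train/data.pbr")
print(url)  # -> s3://my-example-bucket/train/data.pbr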
def writeProtobuftoDisk(X, y, fname):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, y)
    buf.seek(0)

    with open(fname, "wb") as f:
        f.write(buf.read())
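To sanity-check a file written by `writeProtobuftoDisk`, the records can be read back with the same helpers the tests in this listing use; a hedged sketch (the file name is hypothetical, and the helper is named `_read_recordio` in older SDK versions):

from sagemaker.amazon.common import read_recordio
from sagemaker.amazon.record_pb2 import Record

with open("train.pbr", "rb") as f:  # hypothetical file written by writeProtobuftoDisk
    for record_bytes in read_recordio(f):
        record = Record()
        record.ParseFromString(record_bytes)
        # The tensor field depends on the dtype that was written (float32 is an assumption here).
        print(record.features["values"].float32_tensor.keys[:5])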
Example #9
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket, obj)
def writeDatasetToProtobuf(X, bucket, prefix, key, d_type, Y=None):
    buf = io.BytesIO()
    if d_type == "sparse":
        smac.write_spmatrix_to_sparse_tensor(buf, X, labels=Y)
    else:
        smac.write_numpy_to_dense_tensor(buf, X, labels=Y)
        
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket, obj)
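A brief usage sketch of the `d_type` switch in this second variant (bucket, prefix, and keys are hypothetical):

import numpy as np
from scipy.sparse import coo_matrix

X_dense = np.array([[1.0, 2.0], [3.0, 4.0]], dtype="float32")
X_sparse = coo_matrix(X_dense)
labels = np.array([0.0, 1.0], dtype="float32")

# Sparse path -> write_spmatrix_to_sparse_tensor
writeDatasetToProtobuf(X_sparse, "my-example-bucket", "fm/train", "sparse.pbr", "sparse", labels)
# Dense path -> write_numpy_to_dense_tensor
writeDatasetToProtobuf(X_dense, "my-example-bucket", "fm/train", "dense.pbr", "dense", labels)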
Example #11
def test_dense_int_write_spmatrix_to_sparse_tensor():
    array_data = [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]
    keys_data = [[0, 1, 2], [0, 1, 2]]
    array = coo_matrix(np.array(array_data).astype(np.dtype('int')))
    with tempfile.TemporaryFile() as f:
        write_spmatrix_to_sparse_tensor(f, array)
        f.seek(0)
        for record_data, expected_data, expected_keys in zip(read_recordio(f), array_data, keys_data):
            record = Record()
            record.ParseFromString(record_data)
            assert record.features["values"].int32_tensor.values == expected_data
            assert record.features["values"].int32_tensor.keys == expected_keys
            assert record.features["values"].int32_tensor.shape == [len(expected_data)]
def test_dense_float64_spmatrix_to_sparse_label():
    array_data = [[1, 2, 3], [10, 20, 3]]
    keys_data = [[0, 1, 2], [0, 1, 2]]
    array = coo_matrix(np.array(array_data).astype("float64"))
    label_data = np.array([99, 98, 97])
    with tempfile.TemporaryFile() as f:
        write_spmatrix_to_sparse_tensor(f, array, label_data)
        f.seek(0)
        for record_data, expected_data, expected_keys, label in zip(
            read_recordio(f), array_data, keys_data, label_data
        ):
            record = Record()
            record.ParseFromString(record_data)
            assert record.features["values"].float64_tensor.values == expected_data
            assert record.features["values"].float64_tensor.keys == expected_keys
            assert record.label["values"].int32_tensor.values == [label]
            assert record.features["values"].float64_tensor.shape == [len(expected_data)]
Example #14
def convert_to_pbr(sprse_matrix, bucket, prefix, fname_template='emails_part{}.pbr', num_parts=2):

    partition_size = sprse_matrix.shape[0] // num_parts
    for i in range(num_parts):
        # Calculate start and end indices
        begin = i*partition_size
        finish = (i+1)*partition_size
        if i+1 == num_parts:
            finish = sprse_matrix.shape[0]

        # Convert sparse matrix to sparse tensor (record io protobuf) - a format required by NTM algorithm
        # pbr - Amazon Record Protobuf format
        data_bytes = io.BytesIO()
        smamzc.write_spmatrix_to_sparse_tensor(array=sprse_matrix[begin:finish], file=data_bytes, labels=None)
        data_bytes.seek(0)

        # Upload to s3 location specified by bucket and prefix
        file_name = os.path.join(prefix, fname_template.format(i))
        boto3.resource('s3').Bucket(bucket).Object(file_name).upload_fileobj(data_bytes)
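A minimal usage sketch (hypothetical bucket and prefix; `sprse_matrix` is expected to be a row-sliceable SciPy sparse matrix such as CSR, and `smamzc` is assumed to be `sagemaker.amazon.common`):

import io
import os

import boto3
import numpy as np
import sagemaker.amazon.common as smamzc
from scipy.sparse import csr_matrix

emails = csr_matrix(np.random.rand(10, 50).astype("float32"))  # toy document-term matrix
convert_to_pbr(emails, bucket="my-example-bucket", prefix="ntm/train", num_parts=2)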
def test_sparse_int_write_spmatrix_to_sparse_tensor():
    n = 4
    array_data = [[1.0, 2.0], [10.0, 30.0], [100.0, 200.0, 300.0, 400.0], [1000.0, 2000.0, 3000.0]]
    keys_data = [[0, 1], [1, 2], [0, 1, 2, 3], [0, 2, 3]]

    flatten_data = list(itertools.chain.from_iterable(array_data))
    y_indices = list(itertools.chain.from_iterable(keys_data))
    x_indices = [[i] * len(keys_data[i]) for i in range(len(keys_data))]
    x_indices = list(itertools.chain.from_iterable(x_indices))

    array = coo_matrix((flatten_data, (x_indices, y_indices)), dtype='int')
    with tempfile.TemporaryFile() as f:
        write_spmatrix_to_sparse_tensor(f, array)
        f.seek(0)
        for record_data, expected_data, expected_keys in zip(_read_recordio(f), array_data, keys_data):
            record = Record()
            record.ParseFromString(record_data)
            assert record.features["values"].int32_tensor.values == expected_data
            assert record.features["values"].int32_tensor.keys == expected_keys
            assert record.features["values"].int32_tensor.shape == [n]
Example #18
def save_as_protobuf(X, Y, target):
    """Converts features and predictions matrices to recordio protobuf
    Args:
        X:
          2D numpy matrix with features
        Y:
          1D numpy matrix with predictions
        target:
          protobuf file name to be staged
    """
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)

    f, name = tempfile.mkstemp()
    with open(f, "wb") as fd:
        fd.write(buf.getvalue())
        fd.seek(0)
        if target.fs.name == FileSystems.s3:
            target.fs.put(name, str(target))
        else:
            target.move_from(from_path=name)
Example #19
def recordio_serialiser(data):
    buf = io.BytesIO()
    write_spmatrix_to_sparse_tensor(buf, data)
    buf.seek(0)
    return ("application/x-recordio-protobuf", buf)
# Initialize the sparse matrix
num_lines = data.shape[0]
num_columns = len(dictionary)
token_matrix = lil_matrix((num_lines, num_columns)).astype('float32')

print('Filling word matrix, %d lines %d columns ' % (num_lines, num_columns))

# Fill the matrix with word frequencies
line = 0
for _, row in data.iterrows():
    add_row_to_matrix(line, row)
    line += 1  # Can't use indexes, as they may be larger than num_lines

# Write the matrix to protobuf
buf = io.BytesIO()
smac.write_spmatrix_to_sparse_tensor(buf, token_matrix, None)
buf.seek(0)

training_output_path = os.path.join('/opt/ml/processing/train/', 'training.protobuf')
print('Saving training data to {}'.format(training_output_path))
with open(training_output_path, 'wb') as f:
    f.write(buf.getbuffer())

dictionary_output_path = os.path.join('/opt/ml/processing/train/', 'dictionary.pkl')
print('Saving dictionary to {}'.format(dictionary_output_path))
with open(dictionary_output_path, 'wb') as f:
    pickle.dump(dictionary, f)

vocabulary_output_path = os.path.join('/opt/ml/processing/train/', 'vocab.txt')
with open(vocabulary_output_path, 'w') as f:
    for index in range(0, len(dictionary)):