# Shared imports assumed by the snippets below; the original excerpts omit
# their module headers, so this block is inferred from how each snippet
# uses the names.
import io
import itertools
import os
import pickle
import tempfile

import boto3
import numpy as np
import pytest
from scipy.sparse import coo_matrix, lil_matrix

import sagemaker.amazon.common as smac
from sagemaker.amazon.common import read_recordio, write_spmatrix_to_sparse_tensor
from sagemaker.amazon.record_pb2 import Record


def test_invalid_sparse_label():
    # Four labels for a 2x3 matrix are incompatible with both dimensions,
    # so the writer should raise a ValueError.
    array_data = [[1, 2, 3], [10, 20, 3]]
    array = coo_matrix(np.array(array_data))
    label_data = np.array([99, 98, 97, 1000]).astype(np.dtype('float64'))
    with tempfile.TemporaryFile() as f:
        with pytest.raises(ValueError):
            write_spmatrix_to_sparse_tensor(f, array, label_data)
def test_dense_to_sparse():
    # Passing a dense ndarray where a scipy sparse matrix is expected
    # should raise a TypeError.
    array_data = [[1, 2, 3], [10, 20, 3]]
    array = np.array(array_data)
    label_data = np.array([99, 98, 97]).astype(np.dtype("float64"))
    with tempfile.TemporaryFile() as f:
        with pytest.raises(TypeError):
            write_spmatrix_to_sparse_tensor(f, array, label_data)
def recordize(matrices, dest_bucket, prefix, fname_template='data_part{}.pbr', parts=2):
    """Converts a sparse matrix to RecordIO format and uploads it to S3 in parts."""
    s3 = boto3.client('s3')
    chunk_size = matrices.shape[0] // parts
    print("Uploading data to S3:")
    for i in range(parts):
        buffer = io.BytesIO()
        start = i * chunk_size
        end = (i + 1) * chunk_size
        if i + 1 == parts:
            # The last part absorbs any leftover rows.
            end = matrices.shape[0]
        # this function converts each sparse matrix to RecordIO protobuf
        smac.write_spmatrix_to_sparse_tensor(array=matrices[start:end], file=buffer, labels=None)
        buffer.seek(0)
        # Use the fname_template parameter (the original hardcoded the key,
        # leaving the parameter unused).
        key = f"{prefix}/{fname_template.format(i)}"
        s3.upload_fileobj(buffer, Bucket=dest_bucket, Key=key)
        print(f"  s3://{dest_bucket}/{key}")
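# A minimal usage sketch for recordize() above. The bucket name, prefix, and
# matrix shape are illustrative assumptions, not values from the original code.
from scipy.sparse import random as sparse_random

X = sparse_random(1000, 50, density=0.1, format="csr")  # hypothetical feature matrix
recordize(X, dest_bucket="my-example-bucket", prefix="ntm/train", parts=4)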
def split_convert_upload(sparray, bucket, prefix, fname_template='data_part{}.pbr', n_parts=2):
    import io
    import sagemaker.amazon.common as smac

    chunk_size = sparray.shape[0] // n_parts
    for i in range(n_parts):
        # Calculate start and end indices
        start = i * chunk_size
        end = (i + 1) * chunk_size
        if i + 1 == n_parts:
            end = sparray.shape[0]

        # Convert to record protobuf
        buf = io.BytesIO()
        smac.write_spmatrix_to_sparse_tensor(array=sparray[start:end], file=buf, labels=None)
        buf.seek(0)

        # Upload to s3 location specified by bucket and prefix
        fname = os.path.join(prefix, fname_template.format(i))
        boto3.resource('s3').Bucket(bucket).Object(fname).upload_fileobj(buf)
        print('Uploaded data to s3://{}'.format(os.path.join(bucket, fname)))
def save_as_protobuf(X, Y, bucket, key):
    """Converts feature and label matrices to RecordIO protobuf and writes to S3.

    Args:
        X: 2D scipy sparse matrix with features
        Y: 1D numpy array with labels
        bucket: S3 bucket where the RecordIO protobuf file will be staged
        key: protobuf file name to be staged

    Returns:
        S3 URL with key to the protobuf data
    """
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    boto3.resource('s3').Bucket(bucket).Object(key).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket, key)
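# Hypothetical usage of save_as_protobuf(); the toy matrix, bucket, and key
# are illustrative assumptions. The returned S3 URL can be passed as a
# training channel, e.g. to an estimator's fit() call.
X_train = coo_matrix(np.array([[1.0, 0.0], [0.0, 2.0]]))  # toy feature matrix
y_train = np.array([0.0, 1.0])
train_url = save_as_protobuf(X_train, y_train, "my-example-bucket", "fm/train/train.pbr")
print(train_url)  # -> s3://my-example-bucket/fm/train/train.pbr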
def writeProtobuftoDisk(X, y, fname):
    # Serialize the sparse matrix and labels to RecordIO protobuf in memory,
    # then dump the buffer to a local file.
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, y)
    buf.seek(0)
    with open(fname, "wb") as f:
        f.write(buf.read())
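# A round-trip sanity check for writeProtobuftoDisk(), mirroring the tests
# above; a sketch assuming the shared imports, with an illustrative file name
# and toy data.
X = coo_matrix(np.array([[1.0, 0.0], [0.0, 2.0]]))
y = np.array([0.0, 1.0])
writeProtobuftoDisk(X, y, "part0.pbr")
with open("part0.pbr", "rb") as f:
    for rec_bytes in read_recordio(f):
        rec = Record()
        rec.ParseFromString(rec_bytes)
        print(rec.label["values"].float64_tensor.values)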
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket, obj)
def writeDatasetToProtobuf(X, bucket, prefix, key, d_type, Y=None):
    buf = io.BytesIO()
    if d_type == "sparse":
        # Scipy sparse input -> sparse tensor records
        smac.write_spmatrix_to_sparse_tensor(buf, X, labels=Y)
    else:
        # Dense numpy input -> dense tensor records
        smac.write_numpy_to_dense_tensor(buf, X, labels=Y)
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket, obj)
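# Illustrative calls exercising both branches of the d_type switch above;
# the bucket and keys are hypothetical. A scipy sparse matrix goes through
# the "sparse" branch, a plain ndarray through the dense one.
labels = np.array([1.0, 0.0, 1.0])
sparse_url = writeDatasetToProtobuf(coo_matrix(np.eye(3)), "my-example-bucket",
                                    "fm", "train-sparse.pbr", "sparse", Y=labels)
dense_url = writeDatasetToProtobuf(np.eye(3), "my-example-bucket",
                                   "fm", "train-dense.pbr", "dense", Y=labels)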
def test_dense_int_write_spmatrix_to_sparse_tensor():
    array_data = [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]
    keys_data = [[0, 1, 2], [0, 1, 2]]
    array = coo_matrix(np.array(array_data).astype(np.dtype('int')))
    with tempfile.TemporaryFile() as f:
        write_spmatrix_to_sparse_tensor(f, array)
        f.seek(0)
        for record_data, expected_data, expected_keys in zip(
            read_recordio(f), array_data, keys_data
        ):
            record = Record()
            record.ParseFromString(record_data)
            assert record.features["values"].int32_tensor.values == expected_data
            assert record.features["values"].int32_tensor.keys == expected_keys
            assert record.features["values"].int32_tensor.shape == [len(expected_data)]
def test_dense_float64_spmatrix_to_sparse_label():
    array_data = [[1, 2, 3], [10, 20, 3]]
    keys_data = [[0, 1, 2], [0, 1, 2]]
    array = coo_matrix(np.array(array_data).astype("float64"))
    label_data = np.array([99, 98, 97])
    with tempfile.TemporaryFile() as f:
        write_spmatrix_to_sparse_tensor(f, array, label_data)
        f.seek(0)
        for record_data, expected_data, expected_keys, label in zip(
            read_recordio(f), array_data, keys_data, label_data
        ):
            record = Record()
            record.ParseFromString(record_data)
            assert record.features["values"].float64_tensor.values == expected_data
            assert record.features["values"].float64_tensor.keys == expected_keys
            assert record.label["values"].int32_tensor.values == [label]
            assert record.features["values"].float64_tensor.shape == [len(expected_data)]
def convert_to_pbr(sprse_matrix, bucket, prefix, fname_template='emails_part{}.pbr', num_parts=2):
    import sagemaker.amazon.common as smamzc

    partition_size = sprse_matrix.shape[0] // num_parts
    for i in range(num_parts):
        # Calculate start and end indices
        begin = i * partition_size
        finish = (i + 1) * partition_size
        if i + 1 == num_parts:
            finish = sprse_matrix.shape[0]

        # Convert sparse matrix to sparse tensor (RecordIO protobuf) - a format
        # required by the NTM algorithm. pbr - Amazon Record Protobuf format.
        data_bytes = io.BytesIO()
        smamzc.write_spmatrix_to_sparse_tensor(array=sprse_matrix[begin:finish], file=data_bytes, labels=None)
        data_bytes.seek(0)

        # Upload to s3 location specified by bucket and prefix
        file_name = os.path.join(prefix, fname_template.format(i))
        boto3.resource('s3').Bucket(bucket).Object(file_name).upload_fileobj(data_bytes)
def test_sparse_int_write_spmatrix_to_sparse_tensor():
    n = 4
    array_data = [[1.0, 2.0], [10.0, 30.0], [100.0, 200.0, 300.0, 400.0], [1000.0, 2000.0, 3000.0]]
    keys_data = [[0, 1], [1, 2], [0, 1, 2, 3], [0, 2, 3]]
    flatten_data = list(itertools.chain.from_iterable(array_data))
    y_indices = list(itertools.chain.from_iterable(keys_data))
    x_indices = [[i] * len(keys_data[i]) for i in range(len(keys_data))]
    x_indices = list(itertools.chain.from_iterable(x_indices))
    array = coo_matrix((flatten_data, (x_indices, y_indices)), dtype='int')
    with tempfile.TemporaryFile() as f:
        write_spmatrix_to_sparse_tensor(f, array)
        f.seek(0)
        for record_data, expected_data, expected_keys in zip(
            read_recordio(f), array_data, keys_data
        ):
            record = Record()
            record.ParseFromString(record_data)
            assert record.features["values"].int32_tensor.values == expected_data
            assert record.features["values"].int32_tensor.keys == expected_keys
            assert record.features["values"].int32_tensor.shape == [n]
def save_as_protobuf(X, Y, target):
    """Converts feature and label matrices to RecordIO protobuf and stages the file.

    Args:
        X: 2D scipy sparse matrix with features
        Y: 1D numpy array written as the record labels
        target: target object for the staged protobuf file
    """
    # `FileSystems` and the `target` object come from the surrounding workflow
    # framework, which is not shown in this snippet.
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    f, name = tempfile.mkstemp()
    with open(f, "wb") as fd:
        fd.write(buf.getvalue())
        fd.seek(0)
    if target.fs.name == FileSystems.s3:
        target.fs.put(name, str(target))
    else:
        target.move_from(from_path=name)
def recordio_serialiser(data):
    # Serialize a scipy sparse matrix to RecordIO protobuf and return the
    # payload together with its MIME type, e.g. for a SageMaker request.
    buf = io.BytesIO()
    write_spmatrix_to_sparse_tensor(buf, data)
    buf.seek(0)
    return ("application/x-recordio-protobuf", buf)
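# Hypothetical use of recordio_serialiser(): serialize a sparse batch and post
# it with boto3's SageMaker runtime client. The endpoint name is an
# illustrative assumption.
content_type, payload = recordio_serialiser(coo_matrix(np.eye(3)))
runtime = boto3.client("sagemaker-runtime")
response = runtime.invoke_endpoint(
    EndpointName="my-example-endpoint",
    ContentType=content_type,
    Body=payload.read(),
)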
# Initialize the sparse matrix
num_lines = data.shape[0]
num_columns = len(dictionary)
token_matrix = lil_matrix((num_lines, num_columns)).astype('float32')
print('Filling word matrix, %d lines %d columns' % (num_lines, num_columns))

# Fill the matrix with word frequencies
line = 0
for _, row in data.iterrows():
    add_row_to_matrix(line, row)
    line += 1  # Can't use indexes, as they may be larger than num_lines

# Write the matrix to protobuf
buf = io.BytesIO()
smac.write_spmatrix_to_sparse_tensor(buf, token_matrix, None)
buf.seek(0)

training_output_path = os.path.join('/opt/ml/processing/train/', 'training.protobuf')
print('Saving training data to {}'.format(training_output_path))
with open(training_output_path, 'wb') as f:
    f.write(buf.getbuffer())

dictionary_output_path = os.path.join('/opt/ml/processing/train/', 'dictionary.pkl')
print('Saving dictionary to {}'.format(dictionary_output_path))
with open(dictionary_output_path, 'wb') as f:
    pickle.dump(dictionary, f)

vocabulary_output_path = os.path.join('/opt/ml/processing/train/', 'vocab.txt')
with open(vocabulary_output_path, 'w') as f:
    for index in range(0, len(dictionary)):