def get_minst(data_type="train"):
    """Load the requested MNIST split as an RDD of BigDL Samples.

    :param data_type: which split to read ("train" or "test"), passed
        straight to ``mnist.read_data_sets``.
    :return: An RDD of ``Sample`` with labels shifted to be 1-based.

    NOTE(review): relies on module-level ``sc`` (SparkContext) and the
    ``mnist`` helper module being in scope — confirm in the enclosing file.
    """
    (images, labels) = mnist.read_data_sets("/tmp/mnist/", data_type)
    images = sc.parallelize(images)
    labels = sc.parallelize(labels)
    # Target start from 1 in BigDL.
    # Fix: tuple-parameter unpacking in lambdas (``lambda (f, l):``) was
    # removed in Python 3 (PEP 3113); index into the zipped pair instead.
    record = images.zip(labels).map(
        lambda features_label: Sample.from_ndarray(
            features_label[0], features_label[1] + 1))
    return record
def get_mnist(sc, mnist_path):
    """Load, normalize, and parallelize both MNIST splits.

    Images are standardized with the *training* mean/std (test data is
    normalized with the training statistics on purpose, to avoid leakage).

    :param sc: SparkContext used to create the RDDs.
    :param mnist_path: directory holding (or to receive) the MNIST files.
    :return: Tuple ``(rdd_train_sample, rdd_test_sample)`` of Sample RDDs,
        with labels shifted to be 1-based (BigDL targets start at 1).
    """
    # target is start from 0,
    (train_images, train_labels) = mnist.read_data_sets(mnist_path, "train")
    (test_images, test_labels) = mnist.read_data_sets(mnist_path, "test")
    training_mean = np.mean(train_images)
    training_std = np.std(train_images)
    rdd_train_images = sc.parallelize(train_images)
    rdd_train_labels = sc.parallelize(train_labels)
    rdd_test_images = sc.parallelize(test_images)
    rdd_test_labels = sc.parallelize(test_labels)

    # Fix: tuple-parameter unpacking in lambdas is Python-2-only syntax
    # (removed by PEP 3113); use a named mapper that indexes the pair.
    def _to_sample(features_label):
        # Standardize with the training statistics; 1-based label for BigDL.
        return Sample.from_ndarray(
            (features_label[0] - training_mean) / training_std,
            features_label[1] + 1)

    rdd_train_sample = rdd_train_images.zip(rdd_train_labels).map(_to_sample)
    rdd_test_sample = rdd_test_images.zip(rdd_test_labels).map(_to_sample)
    return (rdd_train_sample, rdd_test_sample)
def get_minst(sc, data_type="train", location="/tmp/mnist"):
    """
    Get and normalize the mnist data. We would download it automatically
    if the data doesn't present at the specific location.

    :param sc: SparkContext
    :param data_type: training data or testing data
    :param location: Location storing the mnist
    :return: A RDD of Sample
    """
    raw_images, raw_labels = mnist.read_data_sets(location, data_type)
    image_rdd = sc.parallelize(raw_images)
    label_rdd = sc.parallelize(raw_labels)

    def to_sample(pair):
        # BigDL class targets are 1-based, hence the +1 on the raw label.
        return Sample.from_ndarray(pair[0], pair[1] + 1)

    return image_rdd.zip(label_rdd).map(to_sample)
def get_minst(sc, data_type="train", location="/tmp/mnist"):
    """
    Get and normalize the mnist data. We would download it automatically
    if the data doesn't present at the specific location.

    :param sc: SparkContext
    :param data_type: training data or testing data
    :param location: Location storing the mnist
    :return: A RDD of Sample
    """
    (images, labels) = mnist.read_data_sets(location, data_type)
    images = sc.parallelize(images)
    labels = sc.parallelize(labels)
    # Target start from 1 in BigDL.
    # Fix: ``lambda (features, label):`` is Python-2-only tuple-parameter
    # unpacking (removed by PEP 3113); index into the pair instead, matching
    # the other Python-3-compatible variant of this function in the file.
    record = images.zip(labels).map(lambda features_label: Sample.from_ndarray(
        features_label[0], features_label[1] + 1))
    return record
# NOTE(review): this chunk begins mid-call — the ``optimizer = Optimizer(``
# opening, plus the definitions of ``state``, ``batch_size``, ``test_data``,
# ``examples_to_show`` and ``mnist_path``, precede this excerpt. Python 2
# print statements are kept as-is.
criterion=MSECriterion(),  # reconstruction loss for the autoencoder
optim_method="Adagrad",
state=state,
end_trigger=MaxEpoch(2),   # stop training after 2 epochs
batch_size=batch_size)
# Tag this run with a timestamp so summaries from separate runs don't collide.
app_name = 'autoencoder-' + dt.datetime.now().strftime("%Y%m%d-%H%M%S")
train_summary = TrainSummary(log_dir='/tmp/bigdl_summaries',
                             app_name=app_name)
optimizer.set_train_summary(train_summary)
print "saving logs to ", app_name
# Boot training process
trained_model = optimizer.optimize()
print "Optimization Done."
# Pull the recorded scalars back as arrays for plotting.
loss = np.array(train_summary.read_scalar("Loss"))
# NOTE(review): ``lr`` is read but never plotted below — confirm whether the
# learning-rate curve was meant to be shown too.
lr = np.array(train_summary.read_scalar("LearningRate"))
plt.figure(figsize=(12, 12))
plt.plot(loss[:, 0], loss[:, 1], label='loss')
plt.xlim(0, loss.shape[0] + 10)
plt.grid(True)
plt.title("loss")
# Visual check: top row shows original test digits, bottom row the
# autoencoder's reconstructions.
(images, labels) = mnist.read_data_sets(mnist_path, "test")
examples = trained_model.predict(test_data).take(10)
f, a = plt.subplots(2, 10, figsize=(10, 2))
for i in range(examples_to_show):
    a[0][i].imshow(np.reshape(images[i], (28, 28)))
    a[1][i].imshow(np.reshape(examples[i], (28, 28)))
# As always, a bit of setup.
# Fix: the bare ``print x`` statements and ``lambda (f, l):`` tuple-unpacking
# lambdas below were Python-2-only; rewritten in forms valid on both 2 and 3.
import pandas
from dataset import mnist
from util.common import *

init_engine()

mnist_path = "datasets/mnist"
# NOTE(review): relies on a module-level ``sc`` (SparkContext), presumably
# brought in by the star import from util.common — confirm.
(train_images, train_labels) = mnist.read_data_sets(mnist_path, "train")
(test_images, test_labels) = mnist.read_data_sets(mnist_path, "test")

# Sanity-check the raw array shapes.
print(train_images.shape)
print(train_labels.shape)
print(test_images.shape)
print(test_labels.shape)

# Show the first ten digits side by side with their labels.
imshow(np.column_stack(train_images[0:10].reshape(10, 28, 28)), cmap='gray')
axis('off')
print("ground truth labels: ")
print(train_labels[0:10])

rdd_train_images = sc.parallelize(train_images)
rdd_train_labels = sc.parallelize(train_labels)
rdd_test_images = sc.parallelize(test_images)
rdd_test_labels = sc.parallelize(test_labels)

# Standardize with *training* statistics (also applied to the test set, so
# no information leaks from the test split).
training_mean = np.mean(train_images)
training_std = np.std(train_images)
# BigDL targets are 1-based, hence the +1 on the labels.
rdd_train_sample = rdd_train_images.zip(rdd_train_labels).map(
    lambda features_label: Sample.from_ndarray(
        (features_label[0] - training_mean) / training_std,
        features_label[1] + 1))
rdd_test_sample = rdd_test_images.zip(rdd_test_labels).map(
    lambda features_label: Sample.from_ndarray(
        (features_label[0] - training_mean) / training_std,
        features_label[1] + 1))
print(rdd_train_sample.count())
print(rdd_test_sample.count())