def test_input_single(self):
    script = """
    x2 = x1 + 1
    x3 = x1 + 2
    """
    script = dml(script).input("x1", 5).output("x2", "x3")
    self.assertEqual(ml.execute(script).get("x2", "x3"), [6, 7])
def test_matrix_toNumPy(self):
    script = """
    m2 = m1 * 2
    """
    rdd1 = sc.parallelize(["1.0,2.0", "3.0,4.0"])
    script = dml(script).input(m1=rdd1).output("m2")
    m2 = ml.execute(script).get("m2")
    self.assertTrue((m2.toNumPy() == np.array([[2.0, 4.0], [6.0, 8.0]])).all())
def test_matrix_toDF(self):
    sums = """
    s1 = sum(m1)
    m2 = m1 * 2
    """
    rdd1 = sc.parallelize(["1.0,2.0", "3.0,4.0"])
    script = dml(sums).input(m1=rdd1).output("m2")
    m2 = ml.execute(script).get("m2")
    self.assertEqual(repr(m2.toDF()),
                     "DataFrame[__INDEX: double, C1: double, C2: double]")
def test_output_list(self):
    script = """
    x1 = 0.2
    x2 = x1 + 1
    x3 = x1 + 2
    """
    script = dml(script).output("x1", "x2", "x3")
    self.assertEqual(ml.execute(script).get("x1", "x2"), [0.2, 1.2])
    self.assertEqual(ml.execute(script).get("x1", "x3"), [0.2, 2.2])
def test_output_matrix(self):
    sums = """
    s1 = sum(m1)
    m2 = m1 * 2
    """
    rdd1 = sc.parallelize(["1.0,2.0", "3.0,4.0"])
    script = dml(sums).input(m1=rdd1).output("s1", "m2")
    s1, m2 = ml.execute(script).get("s1", "m2")
    self.assertEqual((s1, repr(m2)), (10.0, "Matrix"))
def test_rdd(self):
    sums = """
    s1 = sum(m1)
    s2 = sum(m2)
    s3 = 'whatever'
    """
    rdd1 = sc.parallelize(["1.0,2.0", "3.0,4.0"])
    rdd2 = sc.parallelize(["5.0,6.0", "7.0,8.0"])
    script = dml(sums).input(m1=rdd1).input(m2=rdd2).output("s1", "s2", "s3")
    self.assertEqual(ml.execute(script).get("s1", "s2", "s3"),
                     [10.0, 26.0, "whatever"])
def dft_systemml(signal, name):
    prog = dml(dml_script).input('signal', signal).output('DFT')

    return (
        # execute the script inside the SystemML engine running on top of Apache Spark
        ml.execute(prog)
        # read the result from the SystemML execution back as a SystemML Matrix
        .get('DFT')
        # convert the SystemML Matrix to an Apache Spark DataFrame
        .toDF()
        # rename the default column names
        .selectExpr('C1 as %sa' % name, 'C2 as %sb' % name)
        # add a unique ID per row for later joining
        .withColumn("id", monotonically_increasing_id()))
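# A hedged usage sketch (an assumption, not part of the original source): with
# `sc`, `ml`, and `dml_script` already defined in the surrounding notebook, two
# signals can be transformed independently and then joined row-by-row via the
# synthetic "id" column. The names `signal_a`/`signal_b` are hypothetical.
import numpy as np

signal_a = np.sin(np.arange(100)).reshape(-1, 1)   # hypothetical toy signals
signal_b = np.cos(np.arange(100)).reshape(-1, 1)

dft_a = dft_systemml(signal_a, 'a')   # columns: aa, ab, id
dft_b = dft_systemml(signal_b, 'b')   # columns: ba, bb, id

features = dft_a.join(dft_b, 'id').drop('id')
features.show()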
def test_input(self):
    script = """
    x3 = x1 + x2
    """
    script = dml(script).input(x1=5, x2=3).output("x3")
    self.assertEqual(ml.execute(script).get("x3"), 8)
def test_output_string(self):
    script = dml("x1 = 'Hello World'").output("x1")
    self.assertEqual(ml.execute(script).get("x1"), "Hello World")
# To get consistent results we switch from a random matrix initialization to something deterministic

# In[24]:

u = np.arange(100000).reshape((100, 1000))
s = np.arange(100000).reshape((1000, 100))
w = np.arange(10000).reshape((100, 100))

# In[25]:

prog = dml(script).input('U', u).input('S', s).input('W', w).output('res')
res = ml.execute(prog).get('res')
print(res)

# If everything runs fine you should get *6244089899151.321* as result. Feel free to submit your DML script to the grader now!
#
# ### Submission

# In[26]:

get_ipython().system(u'rm -f rklib.py')
get_ipython().system(u'wget https://raw.githubusercontent.com/romeokienzler/developerWorks/master/coursera/ai/rklib.py')
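# A hedged cross-check (an assumption, not part of the original notebook): the
# DML expression res = sum(t(U) %*% (W * (U %*% S))) evaluated in plain NumPy on
# the deterministic u, s, w defined above, so the SystemML result can be
# sanity-checked on a single node.
res_np = np.sum(u.T @ (w * (u @ s)))
print(res_np)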
# In order to show you the advantage of SystemML over numpy, we've blown up the sizes of the matrices. Unfortunately, on a one- or two-worker Spark cluster this takes quite some time to complete. Therefore we've stripped the example down to smaller matrices below, but we've kept the code in case you are curious to check it out. You might want to use some more workers, which you can easily configure in the environment settings of the project within Watson Studio. Just be aware that you're currently limited to 50 free capacity unit hours per month, which are consumed by the additional workers.

# In[7]:

script = """
U = rand(rows=1000, cols=10000, seed=5)
S = rand(rows=10000, cols=1000, seed=23)
W = rand(rows=1000, cols=1000, seed=42)
res = sum(t(U) %*% (W * (U %*% S)))
"""

# To get consistent results we switch from a random matrix initialization to something deterministic: rand() is seeded, so every run produces the same matrices.

# In[8]:

prog = dml(script).output('res')
res = ml.execute(prog).get('res')
print(res)

# If everything runs fine you should get *6252492444241.075* as result (or something in that ballpark). Feel free to submit your DML script to the grader now!
#
# ### Submission

# In[9]:

get_ipython().system(u'rm -f rklib.py')
get_ipython().system(u'wget https://raw.githubusercontent.com/romeokienzler/developerWorks/master/coursera/ai/rklib.py')

# In[11]:
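# A hedged single-node equivalent (an assumption, not from the original
# notebook): the same computation in plain numpy. Note that numpy's RNG cannot
# reproduce DML's seeded rand(), so the printed value will differ from the
# SystemML result; the sketch only illustrates what SystemML parallelizes for you.
import numpy as np

rng = np.random.default_rng(42)
U = rng.random((1000, 10000))
S = rng.random((10000, 1000))
W = rng.random((1000, 1000))

print(np.sum(U.T @ (W * (U @ S))))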
import os
import numpy as np
from pyspark.sql.functions import col, max
import systemml  # pip3 install systemml
from systemml import MLContext, dml, dmlFromFile
from pyspark.context import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext()
sqlContext = SQLContext(sc)
ml = MLContext(sc)

# train_df = sqlContext.read.load('data/train_256.parquet')
val_df = sqlContext.read.load('data/val_256.parquet')
X_val = val_df.select("__INDEX", "sample")

# collect runtime statistics and the execution plan for debugging/profiling
ml.setStatistics(True).setStatisticsMaxHeavyHitters(30).setExplain(True)

# dmlFromFile() loads the script from disk; dml() would treat the file name
# itself as DML source code
script = dmlFromFile("resnet_prediction_parfor_rowwisecropping.dml").input(
    X=X_val).output("Y")
Y = ml.execute(script).get("Y").toDF()
Y.show()
  start_w = ceil((Win - Wout) / 2)
  end_w = start_w + Wout - 1
  mask = matrix(0, rows=Hin, cols=Win)
  temp_mask = matrix(1, rows=Hout, cols=Wout)
  mask[start_h:end_h, start_w:end_w] = temp_mask
  # flatten the 2D mask and replicate it once per RGB channel
  mask = matrix(mask, rows=1, cols=Hin*Win)
  mask = cbind(cbind(mask, mask), mask)
  # keep only the masked columns; the +1/-1 shift keeps zero-valued pixels from
  # being treated as empty
  out = removeEmpty(target=(input+1), margin="cols", select=mask) - 1
}

X = crop_rgb(X, 256, 256, 224, 224)

# Scale images to [-1,1]
X = X / 255
X = X * 2 - 1

# One-hot encode the labels
num_tumor_classes = 3
n = nrow(y)
Y = table(seq(1, n), y, n, num_tumor_classes)
"""
    outputs = ("X", "Y")
    script = dml(script).input(X=X_df, y=y_df).output(*outputs)
    X, Y = ml.execute(script).get(*outputs)
    return X, Y


X_val, Y_val = preprocess(val_df)

# collect runtime statistics and the execution plan for debugging/profiling
ml.setStatistics(True).setStatisticsMaxHeavyHitters(30).setExplain(True)

script = dmlFromFile("resnet_prediction_parfor.dml").input(X=X_val).output("Y")
Y = ml.execute(script).get("Y").toDF()
Y.show()
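# A hedged NumPy sketch (an assumption, not part of the original source) of what
# the DML preprocessing above computes for a single flattened RGB row: a centered
# Hout x Wout crop per channel, scaling to [-1, 1], and one-hot encoding of a
# 1-based label. The helper names are hypothetical.
import math
import numpy as np

def crop_rgb_np(row, hin=256, win=256, hout=224, wout=224):
    img = row.reshape(3, hin, win)             # channels-first image
    start_h = math.ceil((hin - hout) / 2) - 1  # DML is 1-based; shift to 0-based
    start_w = math.ceil((win - wout) / 2) - 1
    out = img[:, start_h:start_h + hout, start_w:start_w + wout]
    return out.reshape(-1)

def scale_np(x):
    return (x / 255) * 2 - 1                   # map [0, 255] to [-1, 1]

def one_hot_np(y, num_classes=3):
    return np.eye(num_classes)[y - 1]          # labels are 1-based in the DML

row = np.arange(3 * 256 * 256, dtype=np.float64)
print(scale_np(crop_rgb_np(row)).shape)        # (150528,) == 3 * 224 * 224
print(one_hot_np(np.array([1, 3, 2])))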