def test_dim(self):
    """Check the sizes of the data produced by LinearDataGenerator.

    Both generateLinearInput and generateLinearRDD are exercised; for each
    the number of generated points and the number of features per point
    are asserted.
    """
    # Locally generated data: 4 points requested, 3 weights supplied.
    local_points = LinearDataGenerator.generateLinearInput(
        intercept=0.0,
        weights=[0.0, 0.0, 0.0],
        xMean=[0.0, 0.0, 0.0],
        xVariance=[0.33, 0.33, 0.33],
        nPoints=4,
        seed=0,
        eps=0.1,
    )
    self.assertEqual(len(local_points), 4)
    for labeled_point in local_points:
        self.assertEqual(len(labeled_point.features), 3)

    # RDD-backed data: 6 examples with 2 features, collected to the driver.
    # NOTE(review): `sc` is not defined in this method -- presumably a
    # module-level SparkContext; confirm against the rest of the file.
    rdd_points = LinearDataGenerator.generateLinearRDD(
        sc=sc,
        nexamples=6,
        nfeatures=2,
        eps=0.1,
        nParts=2,
        intercept=0.0,
    ).collect()
    self.assertEqual(len(rdd_points), 6)
    for labeled_point in rdd_points:
        self.assertEqual(len(labeled_point.features), 2)
def test_dim(self):
    """Check the sizes of the data produced by LinearDataGenerator.

    Both generateLinearInput and generateLinearRDD are exercised; for each
    the number of generated points and the number of features per point
    are asserted.
    """
    # Locally generated data: 4 points requested, 3 weights supplied.
    generated = LinearDataGenerator.generateLinearInput(
        intercept=0.0,
        weights=[0.0, 0.0, 0.0],
        xMean=[0.0, 0.0, 0.0],
        xVariance=[0.33, 0.33, 0.33],
        nPoints=4,
        seed=0,
        eps=0.1,
    )
    self.assertEqual(len(generated), 4)
    for sample in generated:
        self.assertEqual(len(sample.features), 3)

    # RDD-backed data built on this test's SparkContext: 6 examples with
    # 2 features, collected to the driver before checking.
    generated = LinearDataGenerator.generateLinearRDD(
        sc=self.sc,
        nexamples=6,
        nfeatures=2,
        eps=0.1,
        nParts=2,
        intercept=0.0,
    ).collect()
    self.assertEqual(len(generated), 6)
    for sample in generated:
        self.assertEqual(len(sample.features), 2)
def test_prediction(self):
    """Test prediction on a model with weights already set.

    The model is initialised with the same weights that are handed to the
    data generator, so predictions are expected to be close to the labels
    without any training.
    """
    model = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
    model.setInitialWeights([10.0, 10.0])

    # Ten batches of 100 points each (a different seed per batch), each
    # mapped to (label, features) pairs as consumed by predictOnValues.
    batches = [
        self.sc.parallelize(
            LinearDataGenerator.generateLinearInput(
                0.0, [10.0, 10.0], [0.0, 0.0], [1.0 / 3.0, 1.0 / 3.0],
                100, 42 + i, 0.1)
        ).map(lambda lp: (lp.label, lp.features))
        for i in range(10)
    ]

    predictions = model.predictOnValues(self.ssc.queueStream(batches))

    # One collected list of (label, prediction) pairs per processed batch.
    collected = []

    def record(rdd):
        collected.append(rdd.collect())

    predictions.foreachRDD(record)
    self.ssc.start()

    def all_batches_processed():
        self.assertEqual(len(collected), len(batches))
        return True

    # We want all batches to finish for this test.
    # NOTE(review): `eventually` is defined outside this view -- presumably
    # it retries the callable until it returns True or a timeout expires.
    eventually(all_batches_processed, catch_assertions=True)

    # Mean absolute error on each batch must be below 0.1.
    for pairs in collected:
        labels, preds = zip(*pairs)
        self.assertTrue(mean(abs(array(labels) - array(preds))) < 0.1)
def test_parameter_convergence(self):
    """Test that the model parameters improve with streaming data.

    The first weight of the latest model is recorded once per batch; the
    recorded sequence must never drop by more than 0.1 between consecutive
    batches.
    """
    regressor = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
    regressor.setInitialWeights([0.0])

    # Ten batches of 100 points each, one seed per batch.
    batches = [
        self.sc.parallelize(
            LinearDataGenerator.generateLinearInput(
                0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + i, 0.1))
        for i in range(10)
    ]

    weight_history = []
    stream = self.ssc.queueStream(batches)

    def snapshot_weight(rdd):
        # The RDD itself is unused; it only triggers one snapshot per batch.
        weight_history.append(regressor.latestModel().weights[0])

    # The snapshot hook is registered before trainOn, as in the original.
    stream.foreachRDD(snapshot_weight)
    regressor.trainOn(stream)
    self.ssc.start()

    def all_batches_processed():
        self.assertEqual(len(weight_history), len(batches))
        return True

    # We want all batches to finish for this test.
    # NOTE(review): the positional 90 is presumably a timeout in seconds;
    # `eventually` is defined outside this view -- confirm.
    eventually(all_batches_processed, 90, catch_assertions=True)

    weights = array(weight_history)
    deltas = weights[1:] - weights[:-1]
    self.assertTrue(all(deltas >= -0.1))
def test_parameter_accuracy(self):
    """Test that coefs are predicted accurately by fitting on toy data.

    Fitting (10*X1 + 10*X2) against (X1, X2) should give coefficients
    (10, 10) and an intercept of 0, each compared to one decimal place.
    """
    model = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
    model.setInitialWeights([0.0, 0.0])

    feature_means = [0.0, 0.0]
    feature_variances = [1.0 / 3.0, 1.0 / 3.0]

    # Ten batches of 100 points each, one seed per batch.
    batches = [
        self.sc.parallelize(
            LinearDataGenerator.generateLinearInput(
                0.0, [10.0, 10.0], feature_means, feature_variances,
                100, 42 + i, 0.1))
        for i in range(10)
    ]

    model.trainOn(self.ssc.queueStream(batches))
    self.ssc.start()

    def weights_have_converged():
        # latestModel() is queried separately for each assertion, exactly
        # as the original does.
        self.assertArrayAlmostEqual(
            model.latestModel().weights.array, [10., 10.], 1)
        self.assertAlmostEqual(model.latestModel().intercept, 0.0, 1)
        return True

    # NOTE(review): `eventually` is defined outside this view -- presumably
    # it retries the callable until it returns True or a timeout expires.
    eventually(weights_have_converged, catch_assertions=True)
def test_train_prediction(self):
    """Test that error on test data improves as model is trained.

    The same ten batches are queued twice: once for training and once, as
    (label, features) pairs, for prediction. An error figure is recorded
    for each prediction batch and must decrease by more than 2 between the
    second recorded batch and the latest one.
    """
    model = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
    model.setInitialWeights([0.0])

    # Ten batches of 100 points each, one seed per batch.
    train_batches = [
        self.sc.parallelize(
            LinearDataGenerator.generateLinearInput(
                0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + i, 0.1))
        for i in range(10)
    ]
    predict_batches = [
        rdd.map(lambda lp: (lp.label, lp.features)) for rdd in train_batches]

    errors = []

    def record_error(rdd):
        labels, preds = zip(*rdd.collect())
        # NOTE(review): this is the mean of (|label| - |prediction|), not a
        # mean absolute error; kept exactly as in the original. `abs` must
        # accept tuples here -- presumably numpy's abs imported at file level.
        errors.append(mean(abs(labels) - abs(preds)))

    # Streams and output operations are set up in the original order.
    train_stream = self.ssc.queueStream(train_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    model.trainOn(train_stream)
    predicted_stream = model.predictOnValues(predict_stream)
    predicted_stream.foreachRDD(record_error)
    self.ssc.start()

    def condition():
        # Once every batch has been seen, fail outright if the error did
        # not improve enough.
        if len(errors) == len(predict_batches):
            self.assertGreater(errors[1] - errors[-1], 2)
        # Succeed early as soon as the improvement is visible.
        if len(errors) >= 3 and errors[1] - errors[-1] > 2:
            return True
        # Otherwise report progress back to the polling helper.
        return "Latest errors: " + ", ".join(str(err) for err in errors)

    self._eventually(condition)
def test_prediction(self):
    """Test prediction on a model with weights already set.

    The model is initialised with the same weights that are handed to the
    data generator, so predictions are expected to be close to the labels
    without any training.
    """
    regressor = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
    regressor.setInitialWeights([10.0, 10.0])

    def to_label_features(lp):
        return (lp.label, lp.features)

    # Ten batches of 100 points each (a different seed per batch), each
    # mapped to (label, features) pairs as consumed by predictOnValues.
    batches = []
    for seed_offset in range(10):
        points = LinearDataGenerator.generateLinearInput(
            0.0, [10.0, 10.0], [0.0, 0.0], [1.0 / 3.0, 1.0 / 3.0],
            100, 42 + seed_offset, 0.1)
        batch_rdd = self.sc.parallelize(points)
        batches.append(batch_rdd.map(to_label_features))

    queued = self.ssc.queueStream(batches)
    predicted = regressor.predictOnValues(queued)

    # One collected list of (label, prediction) pairs per processed batch.
    results = []
    predicted.foreachRDD(lambda rdd: results.append(rdd.collect()))
    self.ssc.start()

    def every_batch_seen():
        self.assertEqual(len(results), len(batches))
        return True

    # We want all batches to finish for this test.
    self._eventually(every_batch_seen, catch_assertions=True)

    # Mean absolute error on each batch must be below 0.1.
    for result in results:
        expected, actual = zip(*result)
        batch_error = mean(abs(array(expected) - array(actual)))
        self.assertTrue(batch_error < 0.1)
def test_parameter_convergence(self):
    """Test that the model parameters improve with streaming data.

    The first weight of the latest model is recorded once per batch; the
    recorded sequence must never drop by more than 0.1 between consecutive
    batches.
    """
    model = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
    model.setInitialWeights([0.0])

    # Ten batches of 100 points each, one seed per batch.
    batches = []
    for seed_offset in range(10):
        points = LinearDataGenerator.generateLinearInput(
            0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + seed_offset, 0.1)
        batches.append(self.sc.parallelize(points))

    recorded = []
    queued = self.ssc.queueStream(batches)
    # The snapshot hook is registered before trainOn, as in the original;
    # the RDD argument is ignored and only triggers one snapshot per batch.
    queued.foreachRDD(
        lambda rdd: recorded.append(model.latestModel().weights[0]))
    model.trainOn(queued)
    self.ssc.start()

    def every_batch_seen():
        self.assertEqual(len(recorded), len(batches))
        return True

    # We want all batches to finish for this test.
    self._eventually(every_batch_seen, catch_assertions=True)

    trajectory = array(recorded)
    step_changes = trajectory[1:] - trajectory[:-1]
    self.assertTrue(all(step_changes >= -0.1))
def test_parameter_accuracy(self):
    """Test that coefs are predicted accurately by fitting on toy data.

    Fitting (10*X1 + 10*X2) against (X1, X2) should give coefficients
    (10, 10) and an intercept of 0, each compared to one decimal place.
    """
    regressor = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
    regressor.setInitialWeights([0.0, 0.0])

    x_mean = [0.0, 0.0]
    x_variance = [1.0 / 3.0, 1.0 / 3.0]

    # Ten batches of 100 points each, one seed per batch.
    batches = []
    for seed_offset in range(10):
        points = LinearDataGenerator.generateLinearInput(
            0.0, [10.0, 10.0], x_mean, x_variance, 100, 42 + seed_offset, 0.1)
        batches.append(self.sc.parallelize(points))

    queued = self.ssc.queueStream(batches)
    regressor.trainOn(queued)
    self.ssc.start()

    def coefficients_match():
        # latestModel() is queried separately for each assertion, exactly
        # as the original does.
        self.assertArrayAlmostEqual(
            regressor.latestModel().weights.array, [10., 10.], 1)
        self.assertAlmostEqual(regressor.latestModel().intercept, 0.0, 1)
        return True

    self._eventually(coefficients_match, catch_assertions=True)
# --- Disabled demo: transposing a numeric matrix (kept for reference) ---
# print(""" ___ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ___
# __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__
# (______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)\n""")
# print("Transpose a numeric matrix")
# print("Before transpose:")
# print(df_numeric.show())
# print("After transpose:")
# print(transposed_df_numeric.show())
# print(""" ___ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ______ ___
# __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__ __)(__
# (______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)(______)\n""")

# --- Disabled demo: MLlib example of linear regression (kept for reference) ---
# lp_rdd = df.select('store','Units','UPC','FeatureOnly','DisplayOnly','FeatureDisplay','MultQty','Categoryid','volume','Price','PriU','BasePriu','BasePrice','TrendIndex','CatTrendIndex','RegPriU','RegPrice','WeekOfYear','HLDY_LA_LG','HLDY_LA_HW','HLDY_HA_LG','HLDY_HA_HW','HLDY_TX_LG','HLDY_TX_HW','HLDY_XM_LG','HLDY_XM_HW','HLDY_NY_LG','HLDY_NY_HW','HLDY_SU_LG','HLDY_SU_HW','HLDY_VA_LG','HLDY_VA_HW','HLDY_EA_LG','HLDY_EA_HW','HLDY_ME_LG','HLDY_ME_HW','HLDY_ID_LG','HLDY_ID_HW','LogHolidayIndex','LogWeekofyearIndex','MfrID','BrandID','Lift62','Lift116','Lift119','Lift164','Lift169','Lift301','Lift343','Lift353','Lift363','Lift369','Lift383','Lift401','Lift413','Lift441','Lift443','Lift482','Lift548','Lift570','Lift572','Lift574','Lift578','Lift598','Lift605','Lift725','Lift726','Lift751','Lift838','Lift857','Lift873','Lift1000','AbsPrice62','AbsPrice116','AbsPrice119','AbsPrice164','AbsPrice169','AbsPrice301','AbsPrice343','AbsPrice353','AbsPrice363','AbsPrice369','AbsPrice383','AbsPrice401','AbsPrice413','AbsPrice441','AbsPrice443','AbsPrice482','AbsPrice548','AbsPrice570','AbsPrice572','AbsPrice574','AbsPrice578','AbsPrice598','AbsPrice605','AbsPrice725','AbsPrice726','AbsPrice751','AbsPrice838','AbsPrice857','AbsPrice873','AbsPrice1000','LogVolume','LogUnits','Discount','LogPriceIndex','LogSeason','LogPrice','LogRegPrice','LogBasePrice','LogPriu','LogRegPriu','LogBasePriu','LogSpecialPack','Intercept','ppgid').map(dfToLPRDD)
# model = LinearRegressionWithSGD.train(lp_rdd)
# # model.save(sc, file_path + "model/lin_reg_model")
# pred = lp_rdd.map(lambda p: (p.label, model.predict(p.features)))
# print(pred.collect())

# Generate a small synthetic linear data set on the driver and print it.
# Positional arguments follow the keyword order used elsewhere in this file:
# intercept, weights, xMean, xVariance, nPoints, seed, eps.
print("generateLinearInput")
data = LinearDataGenerator.generateLinearInput(
    0, [1, 2, 3], [23, 45, 12], [.2, .5, .9], 50, 12314, 1)
print(data)

# Same generator, but the result is an RDD (sc, nexamples, nfeatures, eps).
# Printing the RDD object itself only shows its repr, not the generated
# points, so the 50 examples are collected to the driver for display.
# `data` is left bound to the RDD so later code can still use it as one.
print("generateLinearRDD")
data = LinearDataGenerator.generateLinearRDD(sc, 50, 10, 1)
print(data.collect())
# coefficients model