def test_training_and_prediction(self):
    """Test that the model improves on toy data as the number of batches grows."""
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

    slr = StreamingLogisticRegressionWithSGD(stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(collect_errors)

    self.ssc.start()

    def condition():
        # The error on the last batch should be at least 0.3 lower than
        # on the second batch.
        if len(errors) == len(predict_batches):
            self.assertGreater(errors[1] - errors[-1], 0.3)
        if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
            return True
        return "Latest errors: " + ", ".join(map(str, errors))

    self._eventually(condition, timeout=60.0)
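# The test above relies on two helpers defined on the test class but not
# shown in this excerpt. A minimal sketch of what they are assumed to look
# like (modeled on Spark's own MLlib streaming test utilities):
# generateLogisticInput draws 1-D points whose labels follow a logistic
# model, and calculate_accuracy_error is the fraction of mislabeled points.
from numpy import exp, dot
from numpy.random import RandomState
from pyspark.mllib.regression import LabeledPoint

def generateLogisticInput(offset, scale, nPoints, seed):
    """Generate 1-D points with labels drawn from a logistic model."""
    rng = RandomState(seed)
    x = rng.randn(nPoints)
    sigmoid = 1.0 / (1 + exp(-(dot(x, scale) + offset)))
    y_p = rng.rand(nPoints)
    cut_off = y_p <= sigmoid
    y_p[cut_off] = 1.0
    y_p[~cut_off] = 0.0
    return [LabeledPoint(y_p[i], [x[i]]) for i in range(nPoints)]

def calculate_accuracy_error(true, predicted):
    """Fraction of points whose predicted label differs from the truth."""
    return sum(abs(x - y) for x, y in zip(true, predicted)) / len(predicted)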
def test_predictions(self):
    """Test predicted values on a toy model."""
    # `eventually` is the polling helper from pyspark.testing.utils.
    from pyspark.testing.utils import eventually

    input_batches = []
    for i in range(20):
        batch = self.sc.parallelize(
            self.generateLogisticInput(0, 1.5, 100, 42 + i))
        input_batches.append(batch.map(lambda x: (x.label, x.features)))
    input_stream = self.ssc.queueStream(input_batches)

    slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([1.5])
    predict_stream = slr.predictOnValues(input_stream)
    true_predicted = []
    predict_stream.foreachRDD(lambda x: true_predicted.append(x.collect()))
    self.ssc.start()

    def condition():
        self.assertEqual(len(true_predicted), len(input_batches))
        return True

    eventually(condition, catch_assertions=True)

    # Test that the accuracy error is no more than 0.4 on each batch.
    for batch in true_predicted:
        true, predicted = zip(*batch)
        self.assertTrue(
            self.calculate_accuracy_error(true, predicted) < 0.4)
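# These tests also assume a unittest fixture that provides self.sc, self.ssc
# and the _eventually retry helper. A minimal sketch of that fixture,
# assuming a local SparkContext and a one-second streaming batch interval
# (class and app names here are placeholders):
import unittest
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.classification import StreamingLogisticRegressionWithSGD

class StreamingLogisticRegressionTests(unittest.TestCase):
    def setUp(self):
        self.sc = SparkContext("local[2]", "streaming-logreg-tests")
        self.ssc = StreamingContext(self.sc, 1.0)  # 1-second batches

    def tearDown(self):
        # Stop the streaming context and the SparkContext between tests.
        self.ssc.stop(stopSparkContext=True, stopGraceFully=False)

    def _eventually(self, condition, timeout=30.0, catch_assertions=False):
        # Assumed to behave like pyspark.testing.utils.eventually: call
        # `condition` repeatedly until it returns True or `timeout` expires.
        from pyspark.testing.utils import eventually
        return eventually(condition, timeout, catch_assertions)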
def test_training_and_prediction(self):
    """Test that the model improves on toy data as the number of batches grows."""
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

    slr = StreamingLogisticRegressionWithSGD(stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(collect_errors)

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 20.0, 0.01)
    # Test that the improvement in error is at least 0.3.
    self.assertTrue(errors[1] - errors[-1] > 0.3)
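# _ssc_wait is not defined in this excerpt. A plausible sketch, assuming it
# simply blocks the driver for a fixed period while the streaming job
# consumes the queued batches: poll until `timeout` seconds have passed
# since `start_time`, sleeping `block` seconds between checks (the argument
# names are guesses).
from time import time, sleep

def _ssc_wait(self, start_time, timeout, block):
    while time() - start_time < timeout:
        sleep(block)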
allrdd = [tech_ent, tech_crime, ent_crime]

# Build one pairwise model per class pair (one-vs-one scheme).
# numFeatures = 3
# model.setInitialWeights([0.0, 0.0, 0.0])
models = []  # kept in case the individual models are needed later
labelsAndPreds = []
df = []
for irdd in allrdd:
    print(irdd)
    # modellr = LogisticRegressionWithSGD.train(irdd.map(lambda x: x[0]))
    modellr = StreamingLogisticRegressionWithSGD()
    # Initial weights must be set before trainOn/predictOnValues can run.
    modellr.setInitialWeights([0.0, 0.0, 0.0])
    modellr.trainOn(irdd.map(lambda x: x[0]))
    print(modellr)
    models.append(modellr)
    # outputrdd = parsedData.map(lambda p: (p[0].label, models[i].predict(p[0].features)))
    outputrdd = modellr.predictOnValues(
        parsedTestData.map(lambda lp: (lp[0].label, lp[0].features)))
    labelsAndPreds.append(outputrdd)
    outputdf = outputrdd.toDF(['label', 'prediction']).toPandas()
    df.append(outputdf)

# Tally the one-vs-one votes for every test point.
lab_count = np.zeros((parsedTestData.count(), len(labels)), dtype="int32")
for i in range(len(allrdd)):
    lab_count = makePredOVO(df[i], labels_num[i], lab_count)

cz, correct = 0, 0
parsedTestDataDF['PredictedClass'] = np.empty((len(testData), 0)).tolist()
for i in range(lab_count.shape[0]):
    # Points that received no votes cannot be assigned a class.
    if np.count_nonzero(lab_count[i, ]) == 0:
        cz += 1
        pred_label = "Other"
    else:
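# makePredOVO is called above but not defined in this excerpt. A
# hypothetical sketch of its vote-counting role in the one-vs-one scheme:
# for the pairwise model trained on class pair (a, b), each test row casts
# one vote for whichever of the two classes it was predicted as. The
# (a, b) encoding of `class_pair` is an assumption.
import numpy as np

def makePredOVO(pred_df, class_pair, lab_count):
    a, b = class_pair  # column indices of the two classes in lab_count
    for row, prediction in enumerate(pred_df['prediction']):
        lab_count[row, b if prediction == 1.0 else a] += 1
    return lab_count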
# Average each tweet's word vectors into one fixed-size feature vector,
# dropping tweets with no usable vectors.
features_training = features_training \
    .filter(lambda tweet: check_None(tweet[0])) \
    .map(lambda tweet: (media(tweet[0], vectorSize), tweet[1]))
features_test = features_test \
    .filter(lambda tweet: check_None(tweet[0])) \
    .map(lambda tweet: (media(tweet[0], vectorSize), tweet[1]))

# Wrap each (features, label) pair in a LabeledPoint and drop empty vectors.
features_training = features_training \
    .map(lambda tweet: LabeledPoint(tweet[1], tweet[0])) \
    .filter(lambda labeled: labeled.features)
features_test = features_test \
    .map(lambda tweet: LabeledPoint(tweet[1], tweet[0])) \
    .filter(lambda labeled: labeled.features)

model_2 = StreamingLogisticRegressionWithSGD()
model_2.setInitialWeights([0.0] * vectorSize)
model_2.trainOn(features_training)

# Test: labels are 0 = ITA, 1 = ENG.
predictions = model_2.predictOnValues(
    features_test.map(lambda tweet: (tweet.label, tweet.features)))

# Count correct English and correct Italian predictions over a sliding
# window (pprint returns None, so the counts are only printed, not stored).
predictions.window(test_seconds, 1) \
    .filter(lambda prediction: prediction[0] == 1.0 and prediction[1] == 1) \
    .map(lambda prediction: (prediction, 1)) \
    .reduceByKey(lambda a, b: a + b).pprint()
predictions.window(test_seconds, 1) \
    .filter(lambda prediction: prediction[0] == 0.0 and prediction[1] == 0) \
    .map(lambda prediction: (prediction, 1)) \
    .reduceByKey(lambda a, b: a + b).pprint()
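# check_None, media, vectorSize and test_seconds come from earlier in this
# script and are not shown here. A minimal sketch of the two helpers, under
# the assumption that each tweet arrives as a list of per-word embedding
# vectors: check_None drops tweets with no usable vectors, and media
# ("average" in Italian) averages them into one fixed-size vector.
import numpy as np

def check_None(word_vectors):
    # Keep only tweets where at least one word mapped to a known embedding.
    return word_vectors is not None and len(word_vectors) > 0

def media(word_vectors, vectorSize):
    # Element-wise mean of the word vectors: one vector per tweet.
    acc = np.zeros(vectorSize)
    for v in word_vectors:
        acc += np.asarray(v, dtype=float)
    return (acc / len(word_vectors)).tolist()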
if __name__ == '__main__':
    # Load the list of known-malicious IPs from the config file
    with open('config/malicious_ips.txt', 'r') as f:
        for line in f:
            MALICIOUS_IPS.append(line.replace('\n', ''))

    # Create the streaming context
    sc = SparkContext(appName="Realtime Packet Classifier")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, UPDATE_TIMER)

    # Create the data streams for the training and streaming directories
    trainingStream = ssc.textFileStream(TRAINING_DIR).map(processTrainingLine)
    secondaryTrainingStream = ssc.textFileStream(SEC_TRAINING_DIR).map(
        processGeneratedLine)
    testingStream = ssc.textFileStream(STREAMING_DIR).map(processGeneratedLine)

    # Create the model and train it on both training streams
    model = StreamingLogisticRegressionWithSGD(numIterations=500)
    model.setInitialWeights([0.0] * 75)
    model.trainOn(trainingStream)
    model.trainOn(secondaryTrainingStream)

    # Predict on values arriving in the streaming directory
    model.predictOnValues(
        testingStream.map(lambda lp: (lp.label, lp.features))).pprint(50)

    # Start the stream and await manual termination
    ssc.start()
    ssc.awaitTermination()
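# The constants and line parsers used above are defined elsewhere in this
# project. A hypothetical sketch, assuming each text line is a
# comma-separated record of the form label,f1,...,f75 (75 features, matching
# the initial weight vector); the paths and batch interval are placeholders.
from pyspark.mllib.regression import LabeledPoint

UPDATE_TIMER = 5                      # streaming batch interval in seconds
TRAINING_DIR = "data/training"        # placeholder directory paths
SEC_TRAINING_DIR = "data/generated"
STREAMING_DIR = "data/streaming"
MALICIOUS_IPS = []

def processTrainingLine(line):
    """Parse one CSV line into a LabeledPoint(label, 75 features)."""
    parts = [float(x) for x in line.split(',')]
    return LabeledPoint(parts[0], parts[1:])

def processGeneratedLine(line):
    # Assumed to share the training-line format in this sketch.
    return processTrainingLine(line)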