def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    stream = self.ssc.queueStream(batches)

    weight_history = []
    slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(stream)
    # Snapshot the first weight after every processed batch.
    stream.foreachRDD(
        lambda _: weight_history.append(slr.latestModel().weights[0]))
    self.ssc.start()

    def condition():
        self.assertEqual(len(weight_history), len(batches))
        return True

    # We want all batches to finish for this test.
    eventually(condition, 60.0, catch_assertions=True)

    weights = array(weight_history)
    deltas = weights[1:] - weights[:-1]
    # Test that weights improve with a small tolerance
    self.assertTrue(all(deltas >= -0.1))
    self.assertTrue(array_sum(deltas > 0) > 1)
def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    input_stream = self.ssc.queueStream(input_batches)

    collected_weights = []
    model = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    model.setInitialWeights([0.0])
    model.trainOn(input_stream)
    # Record the (single) learned weight once per batch.
    input_stream.foreachRDD(
        lambda _: collected_weights.append(model.latestModel().weights[0]))
    self.ssc.start()

    def condition():
        self.assertEqual(len(collected_weights), len(input_batches))
        return True

    # We want all batches to finish for this test.
    self._eventually(condition, 60.0, catch_assertions=True)

    trajectory = array(collected_weights)
    steps = trajectory[1:] - trajectory[:-1]
    # Test that weights improve with a small tolerance
    self.assertTrue(all(steps >= -0.1))
    self.assertTrue(array_sum(steps > 0) > 1)
def test_training_and_prediction(self):
    """
    Test that the model improves on toy data with no. of batches.

    Trains a streaming logistic regression on 20 toy batches while
    simultaneously predicting on the same data, and checks that the
    per-batch error rate drops by more than 0.3 overall.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        # Unzip (label, prediction) pairs and record this batch's error rate.
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(lambda x: collect_errors(x))
    self.ssc.start()

    def condition():
        # Test that the improvement in error is > 0.3
        if len(errors) == len(predict_batches):
            self.assertGreater(errors[1] - errors[-1], 0.3)
        if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
            return True
        # map(str, ...) instead of a redundant lambda wrapper.
        return "Latest errors: " + ", ".join(map(str, errors))

    self._eventually(condition, timeout=60.0)
def test_training_and_prediction(self):
    """
    Test that the model improves on toy data with no. of batches.

    Trains a streaming logistic regression on 40 toy batches while
    simultaneously predicting on the same data, and checks that the
    per-batch error rate drops by more than 0.3 overall.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(40)]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        # Unzip (label, prediction) pairs and record this batch's error rate.
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(lambda x: collect_errors(x))
    self.ssc.start()

    def condition():
        # Test that the improvement in error is > 0.3
        if len(errors) == len(predict_batches):
            self.assertGreater(errors[1] - errors[-1], 0.3)
        if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
            return True
        # map(str, ...) instead of a redundant lambda wrapper.
        return "Latest errors: " + ", ".join(map(str, errors))

    self._eventually(condition, timeout=60.0)
def test_training_and_prediction(self):
    """
    Test that the model improves on toy data with no. of batches.

    Trains and predicts concurrently on 20 toy batches, waiting a fixed
    wall-clock interval, then checks the error rate improved by > 0.3.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches
    ]
    slr = StreamingLogisticRegressionWithSGD(stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        # Unzip (label, prediction) pairs and record this batch's error rate.
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(lambda x: collect_errors(x))

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 20.0, 0.01)

    # Guard first: with a slow stream, fewer than two batches may have been
    # scored, and errors[1] would raise IndexError instead of failing cleanly.
    self.assertGreaterEqual(
        len(errors), 2, "too few batches processed: %r" % errors)
    # Test that the improvement in error is atleast 0.3
    self.assertGreater(errors[1] - errors[-1], 0.3)
def test_training_and_prediction(self):
    """
    Test that the model improves on toy data with no. of batches.

    Trains and predicts concurrently on 20 toy batches, waiting a fixed
    wall-clock interval, then checks the error rate improved by > 0.3.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        # Unzip (label, prediction) pairs and record this batch's error rate.
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(lambda x: collect_errors(x))

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 20.0, 0.01)

    # Guard first: with a slow stream, fewer than two batches may have been
    # scored, and errors[1] would raise IndexError instead of failing cleanly.
    self.assertGreaterEqual(
        len(errors), 2, "too few batches processed: %r" % errors)
    # Test that the improvement in error is atleast 0.3
    self.assertGreater(errors[1] - errors[-1], 0.3)
def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    stream = self.ssc.queueStream(batches)
    weight_log = []
    slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(stream)
    # Record the learned weight after each processed batch.
    stream.foreachRDD(
        lambda _: weight_log.append(slr.latestModel().weights[0]))

    start = time()
    self.ssc.start()
    self._ssc_wait(start, 15.0, 0.01)

    trajectory = array(weight_log)
    steps = trajectory[1:] - trajectory[:-1]
    # Test that weights improve with a small tolerance
    self.assertTrue(all(steps >= -0.1))
    self.assertTrue(array_sum(steps > 0) > 1)
def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    input_stream = self.ssc.queueStream(input_batches)

    snapshots = []
    model = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    model.setInitialWeights([0.0])
    model.trainOn(input_stream)
    # One weight snapshot per batch.
    input_stream.foreachRDD(
        lambda _: snapshots.append(model.latestModel().weights[0]))

    started_at = time()
    self.ssc.start()
    self._ssc_wait(started_at, 15.0, 0.01)

    weights = array(snapshots)
    diffs = weights[1:] - weights[:-1]
    # Test that weights improve with a small tolerance
    self.assertTrue(all(diffs >= -0.1))
    self.assertTrue(array_sum(diffs > 0) > 1)
def test_parameter_accuracy(self):
    """
    Test that the final value of weights is close to the desired value.
    """
    batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    model = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    model.setInitialWeights([0.0])
    model.trainOn(self.ssc.queueStream(batches))

    start = time()
    self.ssc.start()
    self._ssc_wait(start, 20.0, 0.01)

    # Relative error of the learned weight against the true value 1.5.
    relative_error = (1.5 - model.latestModel().weights.array[0]) / 1.5
    self.assertAlmostEqual(relative_error, 0.1, 1)
def test_parameter_accuracy(self):
    """
    Test that the final value of weights is close to the desired value.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    input_stream = self.ssc.queueStream(input_batches)
    slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(input_stream)

    started_at = time()
    self.ssc.start()
    self._ssc_wait(started_at, 20.0, 0.01)

    # How far the learned weight still is from the true value 1.5,
    # as a fraction of 1.5; expected to sit near 0.1 after training.
    rel = (1.5 - slr.latestModel().weights.array[0]) / 1.5
    self.assertAlmostEqual(rel, 0.1, 1)
# One-vs-one (OVO) multi-class setup: each *_crime / *_tech / ... RDD
# presumably holds the examples for one pair of classes — TODO confirm
# against where these RDDs are built (off-screen in this file).
tech_crime = tech_crime.map(t)
ent_crime = ent_crime.map(e)
# All 15 pairwise class-combination RDDs (6 classes -> C(6,2) = 15 pairs).
allrdd = [pol_fin, pol_sports, pol_tech, pol_ent, pol_crime, fin_sports,
          fin_tech, fin_ent, fin_crime, sports_tech, sports_ent, sports_crime,
          tech_ent, tech_crime, ent_crime]
# Build the model
# numFeatures = 3
# model.setInitialWeights([0.0, 0.0, 0.0])
models = []  # incase needed
labelsAndPreds = []
df = []
for irdd in allrdd:
    print(irdd)
    # modellr = LogisticRegressionWithSGD.train(irdd.map(lambda x: x[0]))
    # One streaming binary classifier per class pair.
    # NOTE(review): trainOn is called without setInitialWeights — the
    # streaming model normally requires initial weights; verify this runs.
    modellr = StreamingLogisticRegressionWithSGD()
    modellr.trainOn(irdd.map(lambda x: x[0]))
    print(modellr)
    models.append(modellr)
    # outputrdd = parsedData.map(lambda p: (p[0].label, models[i].predict(p[0].features)))
    # Score the shared test set with this pairwise model.
    outputrdd = modellr.predictOnValues(
        parsedTestData.map(lambda lp: (lp[0].label, lp[0].features)))
    labelsAndPreds.append(outputrdd)
    # Collect predictions into a pandas frame for the OVO vote below.
    outputdf = outputrdd.toDF(['label', 'prediction']).toPandas()
    df.append(outputdf)
# One vote counter per (test example, class); makePredOVO (defined
# elsewhere) accumulates each pairwise model's votes into lab_count.
lab_count = np.zeros((parsedTestData.count(), len(labels)), dtype="int32")
for i in range(0, len(allrdd)):
    lab_count = makePredOVO(df[i], labels_num[i], lab_count)
cz, correct = 0, 0
parsedTestDataDF['PredictedClass'] = pd.np.empty((len(testData), 0)).tolist()
# Loop body continues beyond this chunk of the file.
for i in range(0, lab_count.shape[0]):
lambda tweet: ([model.value.get(word) for word in tweet[0]], tweet[1])) #SUM among vectors features_training = features_training.filter(lambda tweet: check_None(tweet[ 0])).map(lambda tweet: (media(tweet[0], vectorSize), tweet[1])) features_test = features_test.filter(lambda tweet: check_None(tweet[0])).map( lambda tweet: (media(tweet[0], vectorSize), tweet[1])) features_training = features_training.map(lambda tweet: LabeledPoint( tweet[1], tweet[0])).filter(lambda labeled: labeled.features) features_test = features_test.map(lambda tweet: LabeledPoint( tweet[1], tweet[0])).filter(lambda labeled: labeled.features) model_2 = StreamingLogisticRegressionWithSGD() model_2.setInitialWeights([0.0] * vectorSize) model_2.trainOn(features_training) # Test predictions = model_2.predictOnValues( features_test.map(lambda tweet: (tweet.label, tweet.features))) # 0 - ITA # 1 - ENG true_eng = predictions.window(test_seconds, 1) \ .filter(lambda prediction: prediction[0] == 1.0 and prediction[1] == 1) \ .map(lambda prediction: (prediction, 1)) \ .reduceByKey(lambda a, b: a + b).pprint() true_ita = predictions.window(test_seconds, 1) \
# Restore pre-trained weights for each pairwise model from gzipped pickles
# under model_path, then start online training of every model on its
# corresponding class-pair DStream. (The earlier setInitialWeights calls
# for the other pairs follow the same pattern, off-screen above.)
model_sports_ent.setInitialWeights(model.weights)
with gzip.open(model_path + 'sports_crime.pkl.gz', 'rb') as g:
    model = cPickle.load(g)
model_sports_crime.setInitialWeights(model.weights)
with gzip.open(model_path + 'tech_ent.pkl.gz', 'rb') as g:
    model = cPickle.load(g)
model_tech_ent.setInitialWeights(model.weights)
with gzip.open(model_path + 'tech_crime.pkl.gz', 'rb') as g:
    model = cPickle.load(g)
model_tech_crime.setInitialWeights(model.weights)
with gzip.open(model_path + 'ent_crime.pkl.gz', 'rb') as g:
    model = cPickle.load(g)
model_ent_crime.setInitialWeights(model.weights)
# Sanity check: show the last loaded weight vector.
print(model_ent_crime.latestModel().weights)
# Each stream element appears to be a tuple whose first item is the
# LabeledPoint used for training — TODO confirm against stream producers.
model_pol_fin.trainOn(pol_fin.map(lambda x: x[0]))
model_pol_sports.trainOn(pol_sports.map(lambda x: x[0]))
model_pol_tech.trainOn(pol_tech.map(lambda x: x[0]))
model_pol_ent.trainOn(pol_ent.map(lambda x: x[0]))
model_pol_crime.trainOn(pol_crime.map(lambda x: x[0]))
model_fin_sports.trainOn(fin_sports.map(lambda x: x[0]))
model_fin_tech.trainOn(fin_tech.map(lambda x: x[0]))
model_fin_ent.trainOn(fin_ent.map(lambda x: x[0]))
model_fin_crime.trainOn(fin_crime.map(lambda x: x[0]))
model_sports_tech.trainOn(sports_tech.map(lambda x: x[0]))
model_sports_ent.trainOn(sports_ent.map(lambda x: x[0]))
model_sports_crime.trainOn(sports_crime.map(lambda x: x[0]))
model_tech_ent.trainOn(tech_ent.map(lambda x: x[0]))
model_tech_crime.trainOn(tech_crime.map(lambda x: x[0]))
model_ent_crime.trainOn(ent_crime.map(lambda x: x[0]))
if __name__ == '__main__':
    # Load the list of known-malicious IPs before starting anything else.
    with open('config/malicious_ips.txt', 'r') as f:
        for line in f:
            MALICIOUS_IPS.append(str(line.replace('\n', '')))

    # Spark context plus a streaming context ticking every UPDATE_TIMER secs.
    sc = SparkContext(appName="Realtime Packet Classifier")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, UPDATE_TIMER)

    # File streams over the training and live-capture directories.
    trainingStream = ssc.textFileStream(TRAINING_DIR).map(processTrainingLine)
    secondaryTrainingStream = \
        ssc.textFileStream(SEC_TRAINING_DIR).map(processGeneratedLine)
    testingStream = \
        ssc.textFileStream(STREAMING_DIR).map(processGeneratedLine)

    # Streaming logistic-regression model, trained on both training streams.
    model = StreamingLogisticRegressionWithSGD(numIterations=500)
    model.setInitialWeights([0] * 75)
    model.trainOn(trainingStream)
    model.trainOn(secondaryTrainingStream)

    # Classify incoming packets and print up to 50 predictions per batch.
    model.predictOnValues(
        testingStream.map(lambda lp: (lp.label, lp.features))).pprint(50)

    # Run until manually terminated.
    ssc.start()
    ssc.awaitTermination()