def test_predictions(self):
    """Test predicted values on a toy model.

    The model is never trained here: it keeps its initial weight of 1.5
    and only scores the queued batches.
    """
    # Each queued batch is reshaped into (label, features) pairs, the
    # input form handed to predictOnValues below.
    input_batches = [
        self.sc.parallelize(
            self.generateLogisticInput(0, 1.5, 100, 42 + offset)
        ).map(lambda x: (x.label, x.features))
        for offset in range(20)
    ]
    input_stream = self.ssc.queueStream(input_batches)

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.2, numIterations=25)
    slr.setInitialWeights([1.5])
    predict_stream = slr.predictOnValues(input_stream)

    true_predicted = []

    def record_batch(rdd):
        true_predicted.append(rdd.collect())

    predict_stream.foreachRDD(record_batch)
    self.ssc.start()

    def condition():
        self.assertEqual(len(true_predicted), len(input_batches))
        return True

    self._eventually(condition, catch_assertions=True)

    # The accuracy error must be below 0.4 on every collected batch.
    for pairs in true_predicted:
        true, predicted = zip(*pairs)
        batch_error = self.calculate_accuracy_error(true, predicted)
        self.assertTrue(batch_error < 0.4)
def test_predictions(self):
    """Test predicted values on a toy model.

    The model is never trained here: it keeps its initial weight of 1.5
    and only scores the queued batches.
    """
    # Each queued batch is reshaped into (label, features) pairs, the
    # input form handed to predictOnValues below.
    input_batches = [
        self.sc.parallelize(
            self.generateLogisticInput(0, 1.5, 100, 42 + offset)
        ).map(lambda x: (x.label, x.features))
        for offset in range(20)
    ]
    input_stream = self.ssc.queueStream(input_batches)

    slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([1.5])
    predict_stream = slr.predictOnValues(input_stream)

    true_predicted = []

    def record_batch(rdd):
        true_predicted.append(rdd.collect())

    predict_stream.foreachRDD(record_batch)
    self.ssc.start()

    def condition():
        self.assertEqual(len(true_predicted), len(input_batches))
        return True

    eventually(condition, catch_assertions=True)

    # The accuracy error must be below 0.4 on every collected batch.
    for pairs in true_predicted:
        true, predicted = zip(*pairs)
        batch_error = self.calculate_accuracy_error(true, predicted)
        self.assertTrue(batch_error < 0.4)
def test_training_and_prediction(self):
    """Test that the model improves on toy data with no. of batches.

    The same 20 toy batches feed both a training stream and a prediction
    stream. The accuracy error of each predicted batch is recorded, and
    the error of the second recorded batch must exceed that of the last
    one by more than 0.3.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches
    ]

    slr = StreamingLogisticRegressionWithSGD(stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    # Register training before prediction, as in the original ordering.
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(collect_errors)

    # The start timestamp is taken before the context starts so the wait
    # helper measures from stream start-up.
    t = time()
    self.ssc.start()
    self._ssc_wait(t, 20.0, 0.01)

    # Fail with a readable message instead of an IndexError when too few
    # batches were processed within the wait window.
    self.assertGreater(
        len(errors), 1,
        "expected at least two processed batches, got %d" % len(errors))
    # Test that the improvement in error is greater than 0.3
    self.assertGreater(errors[1] - errors[-1], 0.3)
def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    input_batches = []
    for offset in range(20):
        samples = self.generateLogisticInput(0, 1.5, 100, 42 + offset)
        input_batches.append(self.sc.parallelize(samples))
    input_stream = self.ssc.queueStream(input_batches)
    models = []

    slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(input_stream)

    def record_weight(rdd):
        # Snapshot the first weight of the latest model once per batch.
        models.append(slr.latestModel().weights[0])

    input_stream.foreachRDD(record_weight)

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 15.0, 0.01)

    weight_history = array(models)
    # Batch-to-batch change of the recorded weight.
    diff = weight_history[1:] - weight_history[:-1]

    # Test that weights improve with a small tolerance,
    self.assertTrue(all(diff >= -0.1))
    self.assertTrue(array_sum(diff > 0) > 1)
def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    input_batches = []
    for offset in range(20):
        samples = self.generateLogisticInput(0, 1.5, 100, 42 + offset)
        input_batches.append(self.sc.parallelize(samples))
    input_stream = self.ssc.queueStream(input_batches)
    models = []

    slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(input_stream)

    def record_weight(rdd):
        # Snapshot the first weight of the latest model once per batch.
        models.append(slr.latestModel().weights[0])

    input_stream.foreachRDD(record_weight)
    self.ssc.start()

    def condition():
        self.assertEqual(len(models), len(input_batches))
        return True

    # We want all batches to finish for this test.
    eventually(condition, 60.0, catch_assertions=True)

    weight_history = array(models)
    # Batch-to-batch change of the recorded weight.
    diff = weight_history[1:] - weight_history[:-1]

    # Test that weights improve with a small tolerance
    self.assertTrue(all(diff >= -0.1))
    self.assertTrue(array_sum(diff > 0) > 1)
def test_training_and_prediction(self):
    """Test that the model improves on toy data with no. of batches.

    The same 40 toy batches feed both a training stream and a prediction
    stream. The accuracy error of each predicted batch is recorded, and
    the test passes once the error of the second recorded batch exceeds
    the latest one by more than 0.3.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(40)]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    # Register training before prediction, as in the original ordering.
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(collect_errors)

    self.ssc.start()

    def condition():
        # Test that the improvement in error is > 0.3
        if len(errors) == len(predict_batches):
            # All batches are in: this is the final, hard check.
            self.assertGreater(errors[1] - errors[-1], 0.3)
        if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
            return True
        # Not satisfied yet: hand back a progress string instead of True.
        return "Latest errors: " + ", ".join(map(str, errors))

    self._eventually(condition, timeout=60.0)
def get_model(weight, pretrained=True):
    """Initiate a streaming model.

    With ``pretrained`` truthy, the model is built from the result of
    ``_load_pre_trained_model()`` and ``weight`` is not used. Otherwise a
    plain streaming logistic regression is created with ``weight`` as its
    initial weights.
    """
    if not pretrained:
        fresh_model = StreamingLogisticRegressionWithSGD()
        fresh_model.setInitialWeights(weight)
        return fresh_model

    return MyStreamingLogisticRegressionWithSGD(
        trained_model=_load_pre_trained_model())
def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    input_batches = []
    for offset in range(20):
        samples = self.generateLogisticInput(0, 1.5, 100, 42 + offset)
        input_batches.append(self.sc.parallelize(samples))
    input_stream = self.ssc.queueStream(input_batches)
    models = []

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(input_stream)

    def record_weight(rdd):
        # Snapshot the first weight of the latest model once per batch.
        models.append(slr.latestModel().weights[0])

    input_stream.foreachRDD(record_weight)

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 15.0, 0.01)

    weight_history = array(models)
    # Batch-to-batch change of the recorded weight.
    diff = weight_history[1:] - weight_history[:-1]

    # Test that weights improve with a small tolerance,
    self.assertTrue(all(diff >= -0.1))
    self.assertTrue(array_sum(diff > 0) > 1)
def test_training_and_prediction(self):
    """Test that the model improves on toy data with no. of batches.

    The same 20 toy batches feed both a training stream and a prediction
    stream. The accuracy error of each predicted batch is recorded, and
    the test passes once the error of the second recorded batch exceeds
    the latest one by more than 0.3.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    # Register training before prediction, as in the original ordering.
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(collect_errors)

    self.ssc.start()

    def condition():
        # Test that the improvement in error is > 0.3
        if len(errors) == len(predict_batches):
            # All batches are in: this is the final, hard check.
            self.assertGreater(errors[1] - errors[-1], 0.3)
        if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
            return True
        # Not satisfied yet: hand back a progress string instead of True.
        return "Latest errors: " + ", ".join(map(str, errors))

    self._eventually(condition, timeout=60.0)
def test_training_and_prediction(self):
    """Test that the model improves on toy data with no. of batches.

    The same 20 toy batches feed both a training stream and a prediction
    stream. The accuracy error of each predicted batch is recorded, and
    the error of the second recorded batch must exceed that of the last
    one by more than 0.3.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    # Register training before prediction, as in the original ordering.
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(collect_errors)

    # The start timestamp is taken before the context starts so the wait
    # helper measures from stream start-up.
    t = time()
    self.ssc.start()
    self._ssc_wait(t, 20.0, 0.01)

    # Fail with a readable message instead of an IndexError when too few
    # batches were processed within the wait window.
    self.assertGreater(
        len(errors), 1,
        "expected at least two processed batches, got %d" % len(errors))
    # Test that the improvement in error is greater than 0.3
    self.assertGreater(errors[1] - errors[-1], 0.3)
def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    input_batches = []
    for offset in range(20):
        samples = self.generateLogisticInput(0, 1.5, 100, 42 + offset)
        input_batches.append(self.sc.parallelize(samples))
    input_stream = self.ssc.queueStream(input_batches)
    models = []

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(input_stream)

    def record_weight(rdd):
        # Snapshot the first weight of the latest model once per batch.
        models.append(slr.latestModel().weights[0])

    input_stream.foreachRDD(record_weight)
    self.ssc.start()

    def condition():
        self.assertEqual(len(models), len(input_batches))
        return True

    # We want all batches to finish for this test.
    self._eventually(condition, 60.0, catch_assertions=True)

    weight_history = array(models)
    # Batch-to-batch change of the recorded weight.
    diff = weight_history[1:] - weight_history[:-1]

    # Test that weights improve with a small tolerance
    self.assertTrue(all(diff >= -0.1))
    self.assertTrue(array_sum(diff > 0) > 1)
def get_model(pretrained=True):
    """Initiate a streaming model.

    If ``pretrained`` is truthy, the streaming model is initialised from
    the parameters returned by ``_load_pre_trained_model()``; otherwise
    its initial weights are all zeros, one per ``NUM_FEATURES``.
    """
    if not pretrained:
        fresh_model = StreamingLogisticRegressionWithSGD()
        fresh_model.setInitialWeights([0.0] * NUM_FEATURES)
        return fresh_model

    return MyStreamingLogisticRegressionWithSGD(
        trained_model=_load_pre_trained_model())
def test_parameter_accuracy(self):
    """
    Test that the final value of weights is close to the desired value.
    """
    input_batches = []
    for offset in range(20):
        samples = self.generateLogisticInput(0, 1.5, 100, 42 + offset)
        input_batches.append(self.sc.parallelize(samples))
    input_stream = self.ssc.queueStream(input_batches)

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(input_stream)

    self.ssc.start()

    def condition():
        # Relative gap between the target weight 1.5 and the learned one.
        learned = slr.latestModel().weights.array[0]
        rel = (1.5 - learned) / 1.5
        self.assertAlmostEqual(rel, 0.1, 1)
        return True

    self._eventually(condition, catch_assertions=True)
def test_parameter_accuracy(self):
    """
    Test that the final value of weights is close to the desired value.
    """
    input_batches = []
    for offset in range(20):
        samples = self.generateLogisticInput(0, 1.5, 100, 42 + offset)
        input_batches.append(self.sc.parallelize(samples))
    input_stream = self.ssc.queueStream(input_batches)

    slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(input_stream)

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 20.0, 0.01)

    # Relative gap between the target weight 1.5 and the learned one.
    learned = slr.latestModel().weights.array[0]
    rel = (1.5 - learned) / 1.5
    self.assertAlmostEqual(rel, 0.1, 1)
if __name__ == '__main__':
    # Read the malicious IP list from the config file, one entry per line,
    # dropping the newline characters.
    with open('config/malicious_ips.txt', 'r') as f:
        for line in f:
            MALICIOUS_IPS.append(str(line.replace('\n', '')))

    # Spark context first, then the streaming context built on top of it.
    sc = SparkContext(appName="Realtime Packet Classifier")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, UPDATE_TIMER)

    # One text-file stream per watched directory, each parsed line by line.
    trainingStream = ssc.textFileStream(TRAINING_DIR).map(processTrainingLine)
    secondaryTrainingStream = ssc.textFileStream(
        SEC_TRAINING_DIR).map(processGeneratedLine)
    testingStream = ssc.textFileStream(STREAMING_DIR).map(processGeneratedLine)

    # The model starts from 75 zero weights and trains on both training
    # streams.
    model = StreamingLogisticRegressionWithSGD(numIterations=500)
    model.setInitialWeights([0] * 75)
    model.trainOn(trainingStream)
    model.trainOn(secondaryTrainingStream)

    # Predict on (label, features) pairs from the streaming directory and
    # print up to 50 results per batch.
    model.predictOnValues(
        testingStream.map(lambda lp: (lp.label, lp.features))
    ).pprint(50)

    # Start the stream and await manual termination
    ssc.start()
    ssc.awaitTermination()
# Apply a mapping function to each pairwise dataset. The function is
# chosen by the first topic in the pair name (p, f, s, t, e).
# NOTE(review): pol_fin has no matching .map(p) in this excerpt —
# presumably it is mapped just above; confirm.
pol_sports = pol_sports.map(p)
pol_tech = pol_tech.map(p)
pol_ent = pol_ent.map(p)
pol_crime = pol_crime.map(p)
fin_sports = fin_sports.map(f)
fin_tech = fin_tech.map(f)
fin_ent = fin_ent.map(f)
fin_crime = fin_crime.map(f)
sports_tech = sports_tech.map(s)
sports_ent = sports_ent.map(s)
sports_crime = sports_crime.map(s)
tech_ent = tech_ent.map(t)
tech_crime = tech_crime.map(t)
ent_crime = ent_crime.map(e)

# One streaming logistic-regression model per topic pair (15 in total),
# all created with default constructor arguments.
model_pol_fin = StreamingLogisticRegressionWithSGD()
model_pol_sports = StreamingLogisticRegressionWithSGD()
model_pol_tech = StreamingLogisticRegressionWithSGD()
model_pol_ent = StreamingLogisticRegressionWithSGD()
model_pol_crime = StreamingLogisticRegressionWithSGD()
model_fin_sports = StreamingLogisticRegressionWithSGD()
model_fin_tech = StreamingLogisticRegressionWithSGD()
model_fin_ent = StreamingLogisticRegressionWithSGD()
model_fin_crime = StreamingLogisticRegressionWithSGD()
model_sports_tech = StreamingLogisticRegressionWithSGD()
model_sports_ent = StreamingLogisticRegressionWithSGD()
model_sports_crime = StreamingLogisticRegressionWithSGD()
model_tech_ent = StreamingLogisticRegressionWithSGD()
model_tech_crime = StreamingLogisticRegressionWithSGD()
model_ent_crime = StreamingLogisticRegressionWithSGD()
features_test = test.map( lambda tweet: (filtering(tweet[0].split(" ")), tweet[1])).map( lambda tweet: ([model.value.get(word) for word in tweet[0]], tweet[1])) #SUM among vectors features_training = features_training.filter(lambda tweet: check_None(tweet[ 0])).map(lambda tweet: (media(tweet[0], vectorSize), tweet[1])) features_test = features_test.filter(lambda tweet: check_None(tweet[0])).map( lambda tweet: (media(tweet[0], vectorSize), tweet[1])) features_training = features_training.map(lambda tweet: LabeledPoint( tweet[1], tweet[0])).filter(lambda labeled: labeled.features) features_test = features_test.map(lambda tweet: LabeledPoint( tweet[1], tweet[0])).filter(lambda labeled: labeled.features) model_2 = StreamingLogisticRegressionWithSGD() model_2.setInitialWeights([0.0] * vectorSize) model_2.trainOn(features_training) # Test predictions = model_2.predictOnValues( features_test.map(lambda tweet: (tweet.label, tweet.features))) # 0 - ITA # 1 - ENG true_eng = predictions.window(test_seconds, 1) \ .filter(lambda prediction: prediction[0] == 1.0 and prediction[1] == 1) \ .map(lambda prediction: (prediction, 1)) \ .reduceByKey(lambda a, b: a + b).pprint()
else: table.put(row=date, data={'tweet_count:pos': str(data[1])}) connection.close() if __name__ == '__main__': # creating a SparkContext object sc = SparkContext.getOrCreate() # setting the log level to avoid printing logs in the console sc.setLogLevel("WARN") # creating a Spark Streaming Context ssc = StreamingContext(sparkContext=sc, batchDuration=10) # setting up a model lr = StreamingLogisticRegressionWithSGD() # loading the pre-trained parameters parameters = json.load(open('model.json', 'r')) # assigning the pre-trained parameters to the logistic regression lr.setInitialWeights(parameters['weights']) # loading stop words stop_words = load_stopwords() # loading common words common_words = load_common_words() # creating the reference table reference_table = create_hash_table(common_words=common_words, stop_words=stop_words) # opening the stream kafkaStream = KafkaUtils.createDirectStream(ssc=ssc, topics=['trump'], kafkaParams={"metadata.broker.list": 'localhost:9092'})
def append_key_to_dictionary(dictionary, key, value):
    """Set ``dictionary[key]`` to ``value`` in place and return the same dictionary."""
    dictionary[key] = value
    return dictionary


def insert_into_table(values, table_name, host, port):
    """Placeholder: accepts its arguments and does nothing (body is ``pass``)."""
    pass


if __name__ == '__main__':
    sc = SparkContext(appName='PythonSparkStreamingKafka')
    sc.setLogLevel("WARN")  # avoid printing logs

    # setting up a model
    # NOTE(review): this streaming model is replaced a few lines below by
    # the result of create_logistic_regression_skl(), so this instance
    # appears unused — confirm.
    lr = StreamingLogisticRegressionWithSGD()
    # NOTE(review): the file object passed to json.load is never closed
    # explicitly.
    parameters = json.load(open('model.json', 'r'))
    # lr.setInitialWeights(parameters['weights'])
    # Build the classifier from the stored weights and intercept.
    lr = create_logistic_regression_skl(parameters['weights'],
                                        parameters['intercept'])

    stop_words = load_stopwords()
    common_words = load_common_words()
    reference_table = create_hash_table(common_words=common_words,
                                        stop_words=stop_words)

    # Streaming context with a batch duration of 2, plus a SQL context on
    # the same SparkContext.
    ssc = StreamingContext(sparkContext=sc, batchDuration=2)
    spark_sql = SQLContext(sparkContext=sc)

    # Direct Kafka stream on the 'trump' topic (call continues past this
    # excerpt).
    kafkaStream = KafkaUtils.createDirectStream(
        ssc=ssc, topics=['trump'],
# Last three pairwise mappings; the earlier ones are outside this excerpt.
tech_ent = tech_ent.map(t)
tech_crime = tech_crime.map(t)
ent_crime = ent_crime.map(e)

# All 15 pairwise datasets. The loops below depend on this order, because
# labels_num is indexed in parallel with it.
allrdd = [pol_fin,pol_sports,pol_tech,pol_ent,pol_crime,fin_sports,fin_tech,fin_ent,fin_crime,sports_tech,sports_ent,sports_crime,
          tech_ent,tech_crime,ent_crime]

# Build the model
# numFeatures = 3
# model.setInitialWeights([0.0, 0.0, 0.0])
models = []  # in case needed
labelsAndPreds = []
df = []

# Train one model per pairwise dataset and collect its predictions on the
# shared test data.
for irdd in allrdd:
    print(irdd)
    # modellr = LogisticRegressionWithSGD.train(irdd.map(lambda x: x[0]))
    # NOTE(review): no setInitialWeights() call is made on this streaming
    # model before trainOn(); the commented-out lines above suggest one
    # was considered — confirm whether it is required.
    modellr = StreamingLogisticRegressionWithSGD()
    modellr.trainOn(irdd.map(lambda x: x[0]))
    print(modellr)
    models.append(modellr)
    #outputrdd = parsedData.map(lambda p: (p[0].label, models[i].predict(p[0].features)))
    outputrdd = modellr.predictOnValues(parsedTestData.map(lambda lp: (lp[0].label, lp[0].features)))
    labelsAndPreds.append(outputrdd)
    # NOTE(review): toDF()/toPandas() assume outputrdd behaves like an RDD
    # of rows rather than a stream — confirm against the streaming API.
    outputdf = outputrdd.toDF(['label', 'prediction']).toPandas()
    df.append(outputdf)

# Vote matrix with one row per test record and one column per label,
# filled in by makePredOVO for each pairwise result.
lab_count = np.zeros((parsedTestData.count(),len(labels)),dtype="int32")
for i in range(0,len(allrdd)):
    lab_count = makePredOVO(df[i],labels_num[i],lab_count)

cz,correct = 0,0
# Start every test row with an empty list in the 'PredictedClass' column.
parsedTestDataDF['PredictedClass'] = pd.np.empty((len(testData), 0)).tolist()