def test_training_and_prediction(self):
        """Test that the model improves on toy data with no. of batches.

        The same 20 generated batches are fed to a training queue stream and,
        mapped to ``(label, features)`` pairs, to a prediction queue stream.
        The accuracy error of each predicted batch is appended to ``errors``;
        the test succeeds once the error has dropped by more than 0.3 compared
        with the second recorded batch.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(20)]
        predict_batches = [
            b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.01, numIterations=25)
        slr.setInitialWeights([-0.1])
        errors = []

        def collect_errors(rdd):
            # Split the collected pairs into the two parallel sequences that
            # calculate_accuracy_error() takes.
            true, predicted = zip(*rdd.collect())
            errors.append(self.calculate_accuracy_error(true, predicted))

        input_stream = self.ssc.queueStream(input_batches)
        predict_stream = self.ssc.queueStream(predict_batches)
        slr.trainOn(input_stream)
        ps = slr.predictOnValues(predict_stream)
        # collect_errors already takes exactly one argument, so no lambda
        # wrapper is needed.
        ps.foreachRDD(collect_errors)

        self.ssc.start()

        def condition():
            # Test that the improvement in error is > 0.3
            if len(errors) == len(predict_batches):
                # Every batch has been scored; nothing more will arrive, so
                # assert outright instead of continuing to poll.
                self.assertGreater(errors[1] - errors[-1], 0.3)
            if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
                return True
            # NOTE(review): presumably _eventually reports this string if the
            # timeout expires -- confirm against its definition.
            return "Latest errors: " + ", ".join(map(str, errors))

        self._eventually(condition, timeout=60.0)
예제 #2
0
    def test_predictions(self):
        """Check per-batch predictions of a model that is never trained.

        The model only receives initial weights of ``[1.5]``; every one of
        the 20 generated batches must then be predicted with an accuracy
        error below 0.4.
        """
        input_batches = [
            self.sc.parallelize(
                self.generateLogisticInput(0, 1.5, 100, 42 + seed_offset)
            ).map(lambda point: (point.label, point.features))
            for seed_offset in range(20)
        ]
        input_stream = self.ssc.queueStream(input_batches)

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.2, numIterations=25)
        slr.setInitialWeights([1.5])
        predict_stream = slr.predictOnValues(input_stream)

        # One entry per processed batch: the collected prediction pairs.
        collected_batches = []
        predict_stream.foreachRDD(
            lambda rdd: collected_batches.append(rdd.collect()))
        self.ssc.start()

        def condition():
            # Holds once every queued batch has been collected.
            self.assertEqual(len(collected_batches), len(input_batches))
            return True

        eventually(condition, catch_assertions=True)

        # The accuracy error must stay below 0.4 on each batch.
        for pairs in collected_batches:
            labels, predictions = zip(*pairs)
            batch_error = self.calculate_accuracy_error(labels, predictions)
            self.assertTrue(batch_error < 0.4)
예제 #3
0
    def test_training_and_prediction(self):
        """Test that the model improves on toy data with no. of batches.

        Forty generated batches feed a training queue stream; the same
        batches, mapped to ``(label, features)`` pairs, feed a prediction
        queue stream.  The accuracy error of each predicted batch is recorded
        and must eventually drop by more than 0.3 compared with the second
        recorded batch.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(40)]
        predict_batches = [
            b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.01, numIterations=25)
        slr.setInitialWeights([-0.1])
        errors = []

        def collect_errors(rdd):
            # Unzip the collected pairs into the two sequences expected by
            # calculate_accuracy_error().
            true, predicted = zip(*rdd.collect())
            errors.append(self.calculate_accuracy_error(true, predicted))

        input_stream = self.ssc.queueStream(input_batches)
        predict_stream = self.ssc.queueStream(predict_batches)
        slr.trainOn(input_stream)
        ps = slr.predictOnValues(predict_stream)
        # Pass the one-argument callback directly; a lambda adds nothing.
        ps.foreachRDD(collect_errors)

        self.ssc.start()

        def condition():
            # Test that the improvement in error is > 0.3
            if len(errors) == len(predict_batches):
                # All batches have been scored, so fail loudly now rather
                # than keep polling.
                self.assertGreater(errors[1] - errors[-1], 0.3)
            if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
                return True
            # NOTE(review): presumably surfaced by _eventually on timeout --
            # confirm against its definition.
            return "Latest errors: " + ", ".join(map(str, errors))

        self._eventually(condition, timeout=60.0)
    def test_predictions(self):
        """Verify predictions made purely from the initial weights.

        No training stream is attached: the model predicts each of the 20
        generated batches with weights ``[1.5]`` and the accuracy error of
        every batch has to be below 0.4.
        """
        def as_label_features(labeled_point):
            return (labeled_point.label, labeled_point.features)

        input_batches = []
        for offset in range(20):
            samples = self.generateLogisticInput(0, 1.5, 100, 42 + offset)
            input_batches.append(
                self.sc.parallelize(samples).map(as_label_features))
        input_stream = self.ssc.queueStream(input_batches)

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.2, numIterations=25)
        slr.setInitialWeights([1.5])
        predict_stream = slr.predictOnValues(input_stream)

        # Collected prediction pairs, one list per processed batch.
        results = []
        predict_stream.foreachRDD(lambda rdd: results.append(rdd.collect()))
        self.ssc.start()

        def condition():
            # Satisfied only when every queued batch has produced a result.
            self.assertEqual(len(results), len(input_batches))
            return True

        self._eventually(condition, catch_assertions=True)

        # Each batch individually must have an accuracy error under 0.4.
        for pairs in results:
            actual, guessed = zip(*pairs)
            error = self.calculate_accuracy_error(actual, guessed)
            self.assertTrue(error < 0.4)
예제 #5
0
    def test_training_and_prediction(self):
        """Test that the model improves on toy data with no. of batches.

        Trains on a queue stream of 20 generated batches while predicting on
        a second stream built from the same batches, records the accuracy
        error per predicted batch, waits via ``_ssc_wait`` and then checks
        that the error dropped by more than 0.3 between the second and the
        last recorded batch.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(
                0, 1.5, 100, 42 + i)) for i in range(20)
        ]
        predict_batches = [
            b.map(lambda lp: (lp.label, lp.features)) for b in input_batches
        ]

        slr = StreamingLogisticRegressionWithSGD(stepSize=0.01,
                                                 numIterations=25)
        slr.setInitialWeights([-0.1])
        errors = []

        def collect_errors(rdd):
            # Split the collected pairs into the two parallel sequences that
            # calculate_accuracy_error() takes.
            true, predicted = zip(*rdd.collect())
            errors.append(self.calculate_accuracy_error(true, predicted))

        input_stream = self.ssc.queueStream(input_batches)
        predict_stream = self.ssc.queueStream(predict_batches)
        slr.trainOn(input_stream)
        ps = slr.predictOnValues(predict_stream)
        # The callback already takes a single RDD argument; no lambda needed.
        ps.foreachRDD(collect_errors)

        # Take the start time before starting the context; it is handed to
        # _ssc_wait together with the 20.0 and 0.01 arguments.
        t = time()
        self.ssc.start()
        self._ssc_wait(t, 20.0, 0.01)

        # Test that the improvement in error is at least 0.3.
        # assertGreater reports both operands when the check fails.
        self.assertGreater(errors[1] - errors[-1], 0.3)
예제 #6
0
파일: tests.py 프로젝트: alexbaretta/spark
    def test_training_and_prediction(self):
        """Test that the model improves on toy data with no. of batches.

        Twenty generated batches are queued for training and, mapped to
        ``(label, features)`` pairs, for prediction.  After waiting via
        ``_ssc_wait`` the accuracy error of the last predicted batch must be
        more than 0.3 lower than that of the second one.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(20)]
        predict_batches = [
            b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.01, numIterations=25)
        slr.setInitialWeights([-0.1])
        errors = []

        def collect_errors(rdd):
            # Unzip the collected pairs into the two sequences expected by
            # calculate_accuracy_error().
            true, predicted = zip(*rdd.collect())
            errors.append(self.calculate_accuracy_error(true, predicted))

        input_stream = self.ssc.queueStream(input_batches)
        predict_stream = self.ssc.queueStream(predict_batches)
        slr.trainOn(input_stream)
        ps = slr.predictOnValues(predict_stream)
        # Register the one-argument callback directly instead of wrapping it.
        ps.foreachRDD(collect_errors)

        # Start time is captured before ssc.start() and passed to _ssc_wait.
        t = time()
        self.ssc.start()
        self._ssc_wait(t, 20.0, 0.01)

        # Test that the improvement in error is at least 0.3.
        # assertGreater shows the actual difference on failure.
        self.assertGreater(errors[1] - errors[-1], 0.3)
              tech_ent,tech_crime,ent_crime]
    # Build the model
    # numFeatures = 3
    # model.setInitialWeights([0.0, 0.0, 0.0])
    models = [] #incase needed
    labelsAndPreds = []
    df = []
    for irdd in allrdd:
        print(irdd)
        # modellr = LogisticRegressionWithSGD.train(irdd.map(lambda x: x[0]))
        modellr = StreamingLogisticRegressionWithSGD()
        modellr.trainOn(irdd.map(lambda x: x[0]))
        print(modellr)
        models.append(modellr)
        #outputrdd = parsedData.map(lambda p: (p[0].label, models[i].predict(p[0].features)))
        outputrdd = modellr.predictOnValues(parsedTestData.map(lambda lp: (lp[0].label, lp[0].features)))
        labelsAndPreds.append(outputrdd)
        outputdf = outputrdd.toDF(['label', 'prediction']).toPandas()
        df.append(outputdf)

    lab_count = np.zeros((parsedTestData.count(),len(labels)),dtype="int32")
    for i in range(0,len(allrdd)):
        lab_count = makePredOVO(df[i],labels_num[i],lab_count)

    cz,correct = 0,0
    parsedTestDataDF['PredictedClass'] = pd.np.empty((len(testData), 0)).tolist()
    for i in range(0,lab_count.shape[0]):
        if np.count_nonzero(lab_count[i,])==0:
            cz += 1
            pred_label = "Other"
        else:
예제 #8
0
# Drop entries rejected by check_None, convert each remaining entry with
# media(...), wrap the result in a LabeledPoint and keep only points whose
# features are truthy.
features_training = (
    features_training
    .filter(lambda pair: check_None(pair[0]))
    .map(lambda pair: (media(pair[0], vectorSize), pair[1]))
    .map(lambda pair: LabeledPoint(pair[1], pair[0]))
    .filter(lambda point: point.features)
)
features_test = (
    features_test
    .filter(lambda pair: check_None(pair[0]))
    .map(lambda pair: (media(pair[0], vectorSize), pair[1]))
    .map(lambda pair: LabeledPoint(pair[1], pair[0]))
    .filter(lambda point: point.features)
)

# Streaming logistic regression starting from all-zero weights.
model_2 = StreamingLogisticRegressionWithSGD()
model_2.setInitialWeights([0.0] * vectorSize)
model_2.trainOn(features_training)

# Test: predict on (label, features) pairs.
predictions = model_2.predictOnValues(
    features_test.map(lambda point: (point.label, point.features)))

# Label encoding:
# 0 - ITA
# 1 - ENG

# Count, per window, the pairs where label and prediction are both 1 (ENG).
# NOTE(review): the chain ends in pprint(), so this name likely ends up
# bound to None -- confirm it is not used later.
true_eng = (
    predictions.window(test_seconds, 1)
    .filter(lambda pair: pair[0] == 1.0 and pair[1] == 1)
    .map(lambda pair: (pair, 1))
    .reduceByKey(lambda left, right: left + right)
    .pprint()
)

# Same count for pairs where label and prediction are both 0 (ITA).
true_ita = (
    predictions.window(test_seconds, 1)
    .filter(lambda pair: pair[0] == 0.0 and pair[1] == 0)
    .map(lambda pair: (pair, 1))
    .reduceByKey(lambda left, right: left + right)
    .pprint()
)
if __name__ == '__main__':
    # Load the configured IP list before any Spark setup.
    with open('config/malicious_ips.txt', 'r') as ip_file:
        for entry in ip_file:
            MALICIOUS_IPS.append(str(entry.replace('\n', '')))

    # Spark context and the streaming context built on top of it.
    sc = SparkContext(appName="Realtime Packet Classifier")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, UPDATE_TIMER)

    # One text-file stream per watched directory, each parsed line by line.
    trainingStream = ssc.textFileStream(TRAINING_DIR).map(processTrainingLine)
    secondaryTrainingStream = (
        ssc.textFileStream(SEC_TRAINING_DIR).map(processGeneratedLine))
    testingStream = ssc.textFileStream(STREAMING_DIR).map(processGeneratedLine)

    # A single model, zero-initialised with 75 weights, trained on both
    # training streams.
    model = StreamingLogisticRegressionWithSGD(numIterations=500)
    model.setInitialWeights([0] * 75)
    model.trainOn(trainingStream)
    model.trainOn(secondaryTrainingStream)

    # Predict on (label, features) pairs from the streaming directory and
    # print up to 50 elements per batch.
    labeled_test_stream = testingStream.map(
        lambda lp: (lp.label, lp.features))
    model.predictOnValues(labeled_test_stream).pprint(50)

    # Start the stream and await manual termination.
    ssc.start()
    ssc.awaitTermination()