Example #1
    def test_predictions(self):
        """Test predicted values on a toy model."""
        input_batches = []
        for i in range(20):
            batch = self.sc.parallelize(
                self.generateLogisticInput(0, 1.5, 100, 42 + i))
            input_batches.append(batch.map(lambda x: (x.label, x.features)))
        input_stream = self.ssc.queueStream(input_batches)

        slr = StreamingLogisticRegressionWithSGD(stepSize=0.2,
                                                 numIterations=25)
        slr.setInitialWeights([1.5])
        predict_stream = slr.predictOnValues(input_stream)
        true_predicted = []
        predict_stream.foreachRDD(lambda x: true_predicted.append(x.collect()))
        self.ssc.start()

        def condition():
            self.assertEqual(len(true_predicted), len(input_batches))
            return True

        eventually(condition, catch_assertions=True)

        # Test that the accuracy error is no more than 0.4 on each batch.
        for batch in true_predicted:
            true, predicted = zip(*batch)
            self.assertTrue(
                self.calculate_accuracy_error(true, predicted) < 0.4)
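Every example below relies on two helpers that the snippets themselves never show. A minimal sketch of plausible implementations, modeled on the helpers in Spark's own MLlib test suite (the exact bodies here are an assumption):

import unittest

from numpy import array, exp
from numpy.random import RandomState
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

class StreamingLogisticRegressionTestCase(unittest.TestCase):

    @staticmethod
    def generateLogisticInput(offset, scale, nPoints, seed):
        # Draw 1-D features and sample binary labels from a logistic model:
        # P(label = 1 | x) = sigmoid(offset + scale * x)
        rng = RandomState(seed)
        x = rng.randn(nPoints)
        sigmoid = 1.0 / (1 + exp(-(offset + scale * x)))
        y = (rng.rand(nPoints) <= sigmoid).astype(float)
        return [LabeledPoint(y[i], Vectors.dense([x[i]]))
                for i in range(nPoints)]

    def calculate_accuracy_error(self, true, predicted):
        # Fraction of labels predicted incorrectly (labels are 0.0 / 1.0)
        return sum(abs(array(true) - array(predicted))) / float(len(true))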
Example #2
    def test_training_and_prediction(self):
        """Test that the model improves on toy data as the number of batches grows."""
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(20)]
        predict_batches = [
            b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.01, numIterations=25)
        slr.setInitialWeights([-0.1])
        errors = []

        def collect_errors(rdd):
            true, predicted = zip(*rdd.collect())
            errors.append(self.calculate_accuracy_error(true, predicted))

        true_predicted = []
        input_stream = self.ssc.queueStream(input_batches)
        predict_stream = self.ssc.queueStream(predict_batches)
        slr.trainOn(input_stream)
        ps = slr.predictOnValues(predict_stream)
        ps.foreachRDD(lambda x: collect_errors(x))

        self.ssc.start()

        def condition():
            # Test that the improvement in error is > 0.3
            if len(errors) == len(predict_batches):
                self.assertGreater(errors[1] - errors[-1], 0.3)
            if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
                return True
            return "Latest errors: " + ", ".join(map(lambda x: str(x), errors))

        self._eventually(condition, timeout=60.0)
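The eventually / _eventually helper used above re-runs a condition until it returns True or a timeout expires, optionally swallowing assertion failures until time runs out. A minimal sketch of the idea (the signature and defaults are assumptions; Spark ships a similar utility in its test helpers):

import time

def eventually(condition, timeout=30.0, catch_assertions=False):
    # Re-evaluate `condition` until it returns True or `timeout` elapses.
    # Any other return value (e.g. a diagnostic string) means "not yet".
    start = time.time()
    last_error = AssertionError("condition not satisfied in %ss" % timeout)
    while time.time() - start < timeout:
        try:
            result = condition()
            if result is True:
                return
            last_error = AssertionError("condition returned: %r" % (result,))
        except AssertionError as e:
            if not catch_assertions:
                raise
            last_error = e
        time.sleep(0.01)
    raise last_error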
Example #3
    def test_convergence(self):
        """
        Test that weights converge to the required value on toy data.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(
                0, 1.5, 100, 42 + i)) for i in range(20)
        ]
        input_stream = self.ssc.queueStream(input_batches)
        models = []

        slr = StreamingLogisticRegressionWithSGD(stepSize=0.2,
                                                 numIterations=25)
        slr.setInitialWeights([0.0])
        slr.trainOn(input_stream)
        input_stream.foreachRDD(
            lambda x: models.append(slr.latestModel().weights[0]))

        self.ssc.start()

        def condition():
            self.assertEqual(len(models), len(input_batches))
            return True

        # We want all batches to finish for this test.
        eventually(condition, 60.0, catch_assertions=True)

        t_models = array(models)
        diff = t_models[1:] - t_models[:-1]
        # Test that weights improve with a small tolerance
        self.assertTrue(all(diff >= -0.1))
        self.assertTrue(array_sum(diff > 0) > 1)
Example #4
    def test_convergence(self):
        """
        Test that weights converge to the required value on toy data.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(20)]
        input_stream = self.ssc.queueStream(input_batches)
        models = []

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.2, numIterations=25)
        slr.setInitialWeights([0.0])
        slr.trainOn(input_stream)
        input_stream.foreachRDD(
            lambda x: models.append(slr.latestModel().weights[0]))

        self.ssc.start()

        def condition():
            self.assertEqual(len(models), len(input_batches))
            return True

        # We want all batches to finish for this test.
        self._eventually(condition, 60.0, catch_assertions=True)

        t_models = array(models)
        diff = t_models[1:] - t_models[:-1]
        # Test that weights improve with a small tolerance
        self.assertTrue(all(diff >= -0.1))
        self.assertTrue(array_sum(diff > 0) > 1)
Example #5
    def test_predictions(self):
        """Test predicted values on a toy model."""
        input_batches = []
        for i in range(20):
            batch = self.sc.parallelize(
                self.generateLogisticInput(0, 1.5, 100, 42 + i))
            input_batches.append(batch.map(lambda x: (x.label, x.features)))
        input_stream = self.ssc.queueStream(input_batches)

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.2, numIterations=25)
        slr.setInitialWeights([1.5])
        predict_stream = slr.predictOnValues(input_stream)
        true_predicted = []
        predict_stream.foreachRDD(lambda x: true_predicted.append(x.collect()))
        self.ssc.start()

        def condition():
            self.assertEqual(len(true_predicted), len(input_batches))
            return True

        self._eventually(condition, catch_assertions=True)

        # Test that the accuracy error is no more than 0.4 on each batch.
        for batch in true_predicted:
            true, predicted = zip(*batch)
            self.assertTrue(
                self.calculate_accuracy_error(true, predicted) < 0.4)
Example #6
    def test_training_and_prediction(self):
        """Test that the model improves on toy data with no. of batches"""
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(
                0, 1.5, 100, 42 + i)) for i in range(20)
        ]
        predict_batches = [
            b.map(lambda lp: (lp.label, lp.features)) for b in input_batches
        ]

        slr = StreamingLogisticRegressionWithSGD(stepSize=0.01,
                                                 numIterations=25)
        slr.setInitialWeights([-0.1])
        errors = []

        def collect_errors(rdd):
            true, predicted = zip(*rdd.collect())
            errors.append(self.calculate_accuracy_error(true, predicted))

        true_predicted = []
        input_stream = self.ssc.queueStream(input_batches)
        predict_stream = self.ssc.queueStream(predict_batches)
        slr.trainOn(input_stream)
        ps = slr.predictOnValues(predict_stream)
        ps.foreachRDD(lambda x: collect_errors(x))

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 20.0, 0.01)

        # Test that the improvement in error is at least 0.3
        self.assertTrue(errors[1] - errors[-1] > 0.3)
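Example #6 above and several later ones call self._ssc_wait(t, ...) instead of a condition poller; it simply blocks for a fixed wall-clock budget so the queued batches can be processed. A minimal sketch of what it plausibly does (the body is an assumption):

import time

def _ssc_wait(self, start_time, end_time, sleep_time):
    # Sleep in small increments until `end_time` seconds have passed
    # since `start_time`, giving the StreamingContext time to run
    # through the queued batches.
    while time.time() - start_time < end_time:
        time.sleep(sleep_time)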
Example #7
    def test_training_and_prediction(self):
        """Test that the model improves on toy data with no. of batches"""
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(40)]
        predict_batches = [
            b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.01, numIterations=25)
        slr.setInitialWeights([-0.1])
        errors = []

        def collect_errors(rdd):
            true, predicted = zip(*rdd.collect())
            errors.append(self.calculate_accuracy_error(true, predicted))

        true_predicted = []
        input_stream = self.ssc.queueStream(input_batches)
        predict_stream = self.ssc.queueStream(predict_batches)
        slr.trainOn(input_stream)
        ps = slr.predictOnValues(predict_stream)
        ps.foreachRDD(lambda x: collect_errors(x))

        self.ssc.start()

        def condition():
            # Test that the improvement in error is > 0.3
            if len(errors) == len(predict_batches):
                self.assertGreater(errors[1] - errors[-1], 0.3)
            if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
                return True
            return "Latest errors: " + ", ".join(map(lambda x: str(x), errors))

        self._eventually(condition, timeout=60.0)
Example #8
    def test_training_and_prediction(self):
        """Test that the model improves on toy data with no. of batches"""
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(20)]
        predict_batches = [
            b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.01, numIterations=25)
        slr.setInitialWeights([-0.1])
        errors = []

        def collect_errors(rdd):
            true, predicted = zip(*rdd.collect())
            errors.append(self.calculate_accuracy_error(true, predicted))

        true_predicted = []
        input_stream = self.ssc.queueStream(input_batches)
        predict_stream = self.ssc.queueStream(predict_batches)
        slr.trainOn(input_stream)
        ps = slr.predictOnValues(predict_stream)
        ps.foreachRDD(lambda x: collect_errors(x))

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 20.0, 0.01)

        # Test that the improvement in error is at least 0.3
        self.assertTrue(errors[1] - errors[-1] > 0.3)
Example #9
    def test_convergence(self):
        """
        Test that weights converge to the required value on toy data.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(
                0, 1.5, 100, 42 + i)) for i in range(20)
        ]
        input_stream = self.ssc.queueStream(input_batches)
        models = []

        slr = StreamingLogisticRegressionWithSGD(stepSize=0.2,
                                                 numIterations=25)
        slr.setInitialWeights([0.0])
        slr.trainOn(input_stream)
        input_stream.foreachRDD(
            lambda x: models.append(slr.latestModel().weights[0]))

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 15.0, 0.01)
        t_models = array(models)
        diff = t_models[1:] - t_models[:-1]

        # Test that weights improve with a small tolerance.
        self.assertTrue(all(diff >= -0.1))
        self.assertTrue(array_sum(diff > 0) > 1)
Example #10
    def test_convergence(self):
        """
        Test that weights converge to the required value on toy data.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(20)]
        input_stream = self.ssc.queueStream(input_batches)
        models = []

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.2, numIterations=25)
        slr.setInitialWeights([0.0])
        slr.trainOn(input_stream)
        input_stream.foreachRDD(
            lambda x: models.append(slr.latestModel().weights[0]))

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 15.0, 0.01)
        t_models = array(models)
        diff = t_models[1:] - t_models[:-1]

        # Test that weights improve with a small tolerance.
        self.assertTrue(all(diff >= -0.1))
        self.assertTrue(array_sum(diff > 0) > 1)
Example #11
    def test_parameter_accuracy(self):
        """
        Test that the final value of weights is close to the desired value.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(20)]
        input_stream = self.ssc.queueStream(input_batches)

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.2, numIterations=25)
        slr.setInitialWeights([0.0])
        slr.trainOn(input_stream)

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 20.0, 0.01)
        rel = (1.5 - slr.latestModel().weights.array[0]) / 1.5
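        # assertAlmostEqual(rel, 0.1, 1) rounds rel - 0.1 to one decimal
        # place, so it accepts roughly 0.05 <= rel <= 0.15: the learned
        # weight must land within about 85-95% of the true value 1.5.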
        self.assertAlmostEqual(rel, 0.1, 1)
Example #12
    def test_parameter_accuracy(self):
        """
        Test that the final value of weights is close to the desired value.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(
                0, 1.5, 100, 42 + i)) for i in range(20)
        ]
        input_stream = self.ssc.queueStream(input_batches)

        slr = StreamingLogisticRegressionWithSGD(stepSize=0.2,
                                                 numIterations=25)
        slr.setInitialWeights([0.0])
        slr.trainOn(input_stream)

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 20.0, 0.01)
        rel = (1.5 - slr.latestModel().weights.array[0]) / 1.5
        self.assertAlmostEqual(rel, 0.1, 1)

Example #13
if __name__ == '__main__':
    # creating a SparkContext object
    sc = SparkContext.getOrCreate()
    # setting the log level to avoid printing logs in the console
    sc.setLogLevel("WARN")
    # creating a Spark Streaming Context
    ssc = StreamingContext(sparkContext=sc, batchDuration=10)

    # setting up a model
    lr = StreamingLogisticRegressionWithSGD()
    # loading the pre-trained parameters
    with open('model.json', 'r') as f:
        parameters = json.load(f)
    # assigning the pre-trained parameters to the logistic regression
    lr.setInitialWeights(parameters['weights'])
    # loading stop words
    stop_words = load_stopwords()
    # loading common words
    common_words = load_common_words()
    # creating the reference table
    reference_table = create_hash_table(common_words=common_words, stop_words=stop_words)

    # opening the stream
    kafkaStream = KafkaUtils.createDirectStream(ssc=ssc,
                                                topics=['trump'],
                                                kafkaParams={"metadata.broker.list": 'localhost:9092'})

    # getting only the useful information
    dfs = kafkaStream.map(lambda stream: stream[1].encode('utf-8'))
    # parsing data into a dictionary
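The snippet breaks off here; the final comment announces a parsing step that is not shown. A hedged sketch of what it might look like, assuming each Kafka payload is a JSON-encoded tweet (the 'text' field name is an assumption):

    # parsing each raw payload into a dictionary (hypothetical continuation)
    parsed = dfs.map(lambda raw: json.loads(raw))
    texts = parsed.map(lambda tweet: tweet.get('text', ''))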
Example #14
    lambda tweet: (filtering(tweet[0].split(" ")), tweet[1])).map(
        lambda tweet: ([model.value.get(word) for word in tweet[0]], tweet[1]))

# Combine each tweet's word vectors into a single fixed-size feature vector
features_training = features_training \
    .filter(lambda tweet: check_None(tweet[0])) \
    .map(lambda tweet: (media(tweet[0], vectorSize), tweet[1]))
features_test = features_test \
    .filter(lambda tweet: check_None(tweet[0])) \
    .map(lambda tweet: (media(tweet[0], vectorSize), tweet[1]))

# Turn the (features, label) pairs into LabeledPoints, dropping empty ones
features_training = features_training \
    .map(lambda tweet: LabeledPoint(tweet[1], tweet[0])) \
    .filter(lambda labeled: labeled.features)
features_test = features_test \
    .map(lambda tweet: LabeledPoint(tweet[1], tweet[0])) \
    .filter(lambda labeled: labeled.features)

model_2 = StreamingLogisticRegressionWithSGD()
model_2.setInitialWeights([0.0] * vectorSize)
model_2.trainOn(features_training)

# Test
predictions = model_2.predictOnValues(
    features_test.map(lambda tweet: (tweet.label, tweet.features)))

# 0 - ITA
# 1 - ENG


# Count tweets whose true label is ENG and whose prediction is ENG,
# over a sliding window, and print the running counts
predictions.window(test_seconds, 1) \
    .filter(lambda prediction: prediction[0] == 1.0 and prediction[1] == 1) \
    .map(lambda prediction: (prediction, 1)) \
    .reduceByKey(lambda a, b: a + b).pprint()
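check_None and media are helpers this snippet assumes: check_None presumably drops tweets containing a word with no vector in the lookup table, and media (Italian for "average") presumably collapses the word vectors into one fixed-size feature vector. A hedged sketch:

import numpy as np

def check_None(vectors):
    # True only if every word of the tweet had a vector in the lookup table
    return all(v is not None for v in vectors)

def media(vectors, vectorSize):
    # Element-wise average of the word vectors: one feature vector per tweet
    total = np.zeros(vectorSize)
    for v in vectors:
        total += np.asarray(v)
    return total / len(vectors)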
Example #15
    model_pol_tech = StreamingLogisticRegressionWithSGD()
    model_pol_ent = StreamingLogisticRegressionWithSGD()
    model_pol_crime = StreamingLogisticRegressionWithSGD()
    model_fin_sports = StreamingLogisticRegressionWithSGD()
    model_fin_tech = StreamingLogisticRegressionWithSGD()
    model_fin_ent = StreamingLogisticRegressionWithSGD()
    model_fin_crime = StreamingLogisticRegressionWithSGD()
    model_sports_tech = StreamingLogisticRegressionWithSGD()
    model_sports_ent = StreamingLogisticRegressionWithSGD()
    model_sports_crime = StreamingLogisticRegressionWithSGD()
    model_tech_ent = StreamingLogisticRegressionWithSGD()
    model_tech_crime = StreamingLogisticRegressionWithSGD()
    model_ent_crime = StreamingLogisticRegressionWithSGD()

    if clear:
        model_pol_fin.setInitialWeights([0.0] * num_features)
        model_pol_sports.setInitialWeights([0.0] * num_features)
        model_pol_tech.setInitialWeights([0.0] * num_features)
        model_pol_ent.setInitialWeights([0.0] * num_features)
        model_pol_crime.setInitialWeights([0.0] * num_features)
        model_fin_sports.setInitialWeights([0.0] * num_features)
        model_fin_tech.setInitialWeights([0.0] * num_features)
        model_fin_ent.setInitialWeights([0.0] * num_features)
        model_fin_crime.setInitialWeights([0.0] * num_features)
        model_sports_tech.setInitialWeights([0.0] * num_features)
        model_sports_ent.setInitialWeights([0.0] * num_features)
        model_sports_crime.setInitialWeights([0.0] * num_features)
        model_tech_ent.setInitialWeights([0.0] * num_features)
        model_tech_crime.setInitialWeights([0.0] * num_features)
        model_ent_crime.setInitialWeights([0.0] * num_features)
    else:
Example #16
if __name__ == '__main__':
    # Get user input first
    with open('config/malicious_ips.txt', 'r') as f:
        for line in f:
            MALICIOUS_IPS.append(str(line.replace('\n', '')))

    # First create the streaming context
    sc = SparkContext(appName="Realtime Packet Classifier")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, UPDATE_TIMER)

    # Create the data streams for the training and streaming directory
    trainingStream = ssc.textFileStream(TRAINING_DIR).map(processTrainingLine)
    secondaryTrainingStream = ssc.textFileStream(SEC_TRAINING_DIR).map(
        processGeneratedLine)
    testingStream = ssc.textFileStream(STREAMING_DIR).map(processGeneratedLine)

    # Create the model and train it on the training data
    model = StreamingLogisticRegressionWithSGD(numIterations=500)
    model.setInitialWeights([0.0] * 75)
    model.trainOn(trainingStream)
    model.trainOn(secondaryTrainingStream)

    # Get the model to predict on values incoming in the streaming directory
    model.predictOnValues(
        testingStream.map(lambda lp: (lp.label, lp.features))).pprint(50)

    # Start the stream and await manual termination
    ssc.start()
    ssc.awaitTermination()
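processTrainingLine and processGeneratedLine are parsers the script assumes; given the 75-element weight vector above, each presumably turns one text line into a LabeledPoint with 75 numeric features. A hedged sketch of one plausible shape (the field layout is an assumption):

from pyspark.mllib.regression import LabeledPoint

def processGeneratedLine(line):
    # Hypothetical parser: comma-separated fields with the label last
    fields = [float(x) for x in line.strip().split(',')]
    return LabeledPoint(fields[-1], fields[:-1])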