def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    stream = self.ssc.queueStream(batches)

    weight_history = []
    slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(stream)
    # Snapshot the first weight after every processed batch.
    stream.foreachRDD(
        lambda _: weight_history.append(slr.latestModel().weights[0]))
    self.ssc.start()

    def condition():
        self.assertEqual(len(weight_history), len(batches))
        return True

    # We want all batches to finish for this test.
    eventually(condition, 60.0, catch_assertions=True)

    weights = array(weight_history)
    deltas = weights[1:] - weights[:-1]
    # Test that weights improve with a small tolerance
    self.assertTrue(all(deltas >= -0.1))
    self.assertTrue(array_sum(deltas > 0) > 1)
def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    input_stream = self.ssc.queueStream(input_batches)

    collected_weights = []
    model = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    model.setInitialWeights([0.0])
    model.trainOn(input_stream)
    # Record the (single) learned weight once per batch.
    input_stream.foreachRDD(
        lambda _: collected_weights.append(model.latestModel().weights[0]))
    self.ssc.start()

    def condition():
        self.assertEqual(len(collected_weights), len(input_batches))
        return True

    # We want all batches to finish for this test.
    self._eventually(condition, 60.0, catch_assertions=True)

    trajectory = array(collected_weights)
    steps = trajectory[1:] - trajectory[:-1]
    # Test that weights improve with a small tolerance
    self.assertTrue(all(steps >= -0.1))
    self.assertTrue(array_sum(steps > 0) > 1)
def test_training_and_prediction(self):
    """
    Test that the model improves on toy data with no. of batches.

    Trains a streaming logistic regression on 20 toy batches while
    simultaneously predicting on the same data, and checks that the
    per-batch error rate drops by more than 0.3 overall.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        # Unzip (label, prediction) pairs and record this batch's error rate.
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(lambda x: collect_errors(x))
    self.ssc.start()

    def condition():
        # Test that the improvement in error is > 0.3
        if len(errors) == len(predict_batches):
            self.assertGreater(errors[1] - errors[-1], 0.3)
        if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
            return True
        # map(str, ...) instead of a redundant lambda wrapper.
        return "Latest errors: " + ", ".join(map(str, errors))

    self._eventually(condition, timeout=60.0)
def test_training_and_prediction(self):
    """
    Test that the model improves on toy data with no. of batches.

    Trains a streaming logistic regression on 40 toy batches while
    simultaneously predicting on the same data, and checks that the
    per-batch error rate drops by more than 0.3 overall.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(40)]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        # Unzip (label, prediction) pairs and record this batch's error rate.
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(lambda x: collect_errors(x))
    self.ssc.start()

    def condition():
        # Test that the improvement in error is > 0.3
        if len(errors) == len(predict_batches):
            self.assertGreater(errors[1] - errors[-1], 0.3)
        if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
            return True
        # map(str, ...) instead of a redundant lambda wrapper.
        return "Latest errors: " + ", ".join(map(str, errors))

    self._eventually(condition, timeout=60.0)
def test_training_and_prediction(self):
    """
    Test that the model improves on toy data with no. of batches.

    Trains and predicts concurrently on 20 toy batches, waiting a fixed
    wall-clock interval, then checks the error rate improved by > 0.3.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches
    ]
    slr = StreamingLogisticRegressionWithSGD(stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        # Unzip (label, prediction) pairs and record this batch's error rate.
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(lambda x: collect_errors(x))

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 20.0, 0.01)

    # Guard first: with a slow stream, fewer than two batches may have been
    # scored, and errors[1] would raise IndexError instead of failing cleanly.
    self.assertGreaterEqual(
        len(errors), 2, "too few batches processed: %r" % errors)
    # Test that the improvement in error is atleast 0.3
    self.assertGreater(errors[1] - errors[-1], 0.3)
def test_training_and_prediction(self):
    """
    Test that the model improves on toy data with no. of batches.

    Trains and predicts concurrently on 20 toy batches, waiting a fixed
    wall-clock interval, then checks the error rate improved by > 0.3.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        # Unzip (label, prediction) pairs and record this batch's error rate.
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(lambda x: collect_errors(x))

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 20.0, 0.01)

    # Guard first: with a slow stream, fewer than two batches may have been
    # scored, and errors[1] would raise IndexError instead of failing cleanly.
    self.assertGreaterEqual(
        len(errors), 2, "too few batches processed: %r" % errors)
    # Test that the improvement in error is atleast 0.3
    self.assertGreater(errors[1] - errors[-1], 0.3)
def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    stream = self.ssc.queueStream(batches)
    weight_log = []
    slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(stream)
    # Record the learned weight after each processed batch.
    stream.foreachRDD(
        lambda _: weight_log.append(slr.latestModel().weights[0]))

    start = time()
    self.ssc.start()
    self._ssc_wait(start, 15.0, 0.01)

    trajectory = array(weight_log)
    steps = trajectory[1:] - trajectory[:-1]
    # Test that weights improve with a small tolerance
    self.assertTrue(all(steps >= -0.1))
    self.assertTrue(array_sum(steps > 0) > 1)
def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    input_stream = self.ssc.queueStream(input_batches)

    snapshots = []
    model = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    model.setInitialWeights([0.0])
    model.trainOn(input_stream)
    # One weight snapshot per batch.
    input_stream.foreachRDD(
        lambda _: snapshots.append(model.latestModel().weights[0]))

    started_at = time()
    self.ssc.start()
    self._ssc_wait(started_at, 15.0, 0.01)

    weights = array(snapshots)
    diffs = weights[1:] - weights[:-1]
    # Test that weights improve with a small tolerance
    self.assertTrue(all(diffs >= -0.1))
    self.assertTrue(array_sum(diffs > 0) > 1)
def test_parameter_accuracy(self):
    """
    Test that the final value of weights is close to the desired value.
    """
    batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    model = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    model.setInitialWeights([0.0])
    model.trainOn(self.ssc.queueStream(batches))

    start = time()
    self.ssc.start()
    self._ssc_wait(start, 20.0, 0.01)

    # Relative error of the learned weight against the true value 1.5.
    relative_error = (1.5 - model.latestModel().weights.array[0]) / 1.5
    self.assertAlmostEqual(relative_error, 0.1, 1)
def test_parameter_accuracy(self):
    """
    Test that the final value of weights is close to the desired value.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)
    ]
    input_stream = self.ssc.queueStream(input_batches)
    slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(input_stream)

    started_at = time()
    self.ssc.start()
    self._ssc_wait(started_at, 20.0, 0.01)

    # How far the learned weight still is from the true value 1.5,
    # as a fraction of 1.5; expected to sit near 0.1 after training.
    rel = (1.5 - slr.latestModel().weights.array[0]) / 1.5
    self.assertAlmostEqual(rel, 0.1, 1)
# One-vs-one (OVO) multi-class setup: each *_crime / *_tech / ... RDD
# presumably holds the examples for one pair of classes — TODO confirm
# against where these RDDs are built (off-screen in this file).
tech_crime = tech_crime.map(t)
ent_crime = ent_crime.map(e)
# All 15 pairwise class-combination RDDs (6 classes -> C(6,2) = 15 pairs).
allrdd = [pol_fin, pol_sports, pol_tech, pol_ent, pol_crime, fin_sports,
          fin_tech, fin_ent, fin_crime, sports_tech, sports_ent, sports_crime,
          tech_ent, tech_crime, ent_crime]
# Build the model
# numFeatures = 3
# model.setInitialWeights([0.0, 0.0, 0.0])
models = []  # incase needed
labelsAndPreds = []
df = []
for irdd in allrdd:
    print(irdd)
    # modellr = LogisticRegressionWithSGD.train(irdd.map(lambda x: x[0]))
    # One streaming binary classifier per class pair.
    # NOTE(review): trainOn is called without setInitialWeights — the
    # streaming model normally requires initial weights; verify this runs.
    modellr = StreamingLogisticRegressionWithSGD()
    modellr.trainOn(irdd.map(lambda x: x[0]))
    print(modellr)
    models.append(modellr)
    # outputrdd = parsedData.map(lambda p: (p[0].label, models[i].predict(p[0].features)))
    # Score the shared test set with this pairwise model.
    outputrdd = modellr.predictOnValues(
        parsedTestData.map(lambda lp: (lp[0].label, lp[0].features)))
    labelsAndPreds.append(outputrdd)
    # Collect predictions into a pandas frame for the OVO vote below.
    outputdf = outputrdd.toDF(['label', 'prediction']).toPandas()
    df.append(outputdf)
# One vote counter per (test example, class); makePredOVO (defined
# elsewhere) accumulates each pairwise model's votes into lab_count.
lab_count = np.zeros((parsedTestData.count(), len(labels)), dtype="int32")
for i in range(0, len(allrdd)):
    lab_count = makePredOVO(df[i], labels_num[i], lab_count)
cz, correct = 0, 0
parsedTestDataDF['PredictedClass'] = pd.np.empty((len(testData), 0)).tolist()
# Loop body continues beyond this chunk of the file.
for i in range(0, lab_count.shape[0]):
lambda tweet: ([model.value.get(word) for word in tweet[0]], tweet[1])) #SUM among vectors features_training = features_training.filter(lambda tweet: check_None(tweet[ 0])).map(lambda tweet: (media(tweet[0], vectorSize), tweet[1])) features_test = features_test.filter(lambda tweet: check_None(tweet[0])).map( lambda tweet: (media(tweet[0], vectorSize), tweet[1])) features_training = features_training.map(lambda tweet: LabeledPoint( tweet[1], tweet[0])).filter(lambda labeled: labeled.features) features_test = features_test.map(lambda tweet: LabeledPoint( tweet[1], tweet[0])).filter(lambda labeled: labeled.features) model_2 = StreamingLogisticRegressionWithSGD() model_2.setInitialWeights([0.0] * vectorSize) model_2.trainOn(features_training) # Test predictions = model_2.predictOnValues( features_test.map(lambda tweet: (tweet.label, tweet.features))) # 0 - ITA # 1 - ENG true_eng = predictions.window(test_seconds, 1) \ .filter(lambda prediction: prediction[0] == 1.0 and prediction[1] == 1) \ .map(lambda prediction: (prediction, 1)) \ .reduceByKey(lambda a, b: a + b).pprint() true_ita = predictions.window(test_seconds, 1) \
# Restore pre-trained weights for each pairwise model from gzipped pickles
# under model_path, then start online training of every model on its
# corresponding class-pair DStream. (The earlier setInitialWeights calls
# for the other pairs follow the same pattern, off-screen above.)
model_sports_ent.setInitialWeights(model.weights)
with gzip.open(model_path + 'sports_crime.pkl.gz', 'rb') as g:
    model = cPickle.load(g)
model_sports_crime.setInitialWeights(model.weights)
with gzip.open(model_path + 'tech_ent.pkl.gz', 'rb') as g:
    model = cPickle.load(g)
model_tech_ent.setInitialWeights(model.weights)
with gzip.open(model_path + 'tech_crime.pkl.gz', 'rb') as g:
    model = cPickle.load(g)
model_tech_crime.setInitialWeights(model.weights)
with gzip.open(model_path + 'ent_crime.pkl.gz', 'rb') as g:
    model = cPickle.load(g)
model_ent_crime.setInitialWeights(model.weights)
# Sanity check: show the last loaded weight vector.
print(model_ent_crime.latestModel().weights)
# Each stream element appears to be a tuple whose first item is the
# LabeledPoint used for training — TODO confirm against stream producers.
model_pol_fin.trainOn(pol_fin.map(lambda x: x[0]))
model_pol_sports.trainOn(pol_sports.map(lambda x: x[0]))
model_pol_tech.trainOn(pol_tech.map(lambda x: x[0]))
model_pol_ent.trainOn(pol_ent.map(lambda x: x[0]))
model_pol_crime.trainOn(pol_crime.map(lambda x: x[0]))
model_fin_sports.trainOn(fin_sports.map(lambda x: x[0]))
model_fin_tech.trainOn(fin_tech.map(lambda x: x[0]))
model_fin_ent.trainOn(fin_ent.map(lambda x: x[0]))
model_fin_crime.trainOn(fin_crime.map(lambda x: x[0]))
model_sports_tech.trainOn(sports_tech.map(lambda x: x[0]))
model_sports_ent.trainOn(sports_ent.map(lambda x: x[0]))
model_sports_crime.trainOn(sports_crime.map(lambda x: x[0]))
model_tech_ent.trainOn(tech_ent.map(lambda x: x[0]))
model_tech_crime.trainOn(tech_crime.map(lambda x: x[0]))
model_ent_crime.trainOn(ent_crime.map(lambda x: x[0]))
if __name__ == '__main__':
    # Load the list of known-malicious IPs before starting anything else.
    with open('config/malicious_ips.txt', 'r') as f:
        for line in f:
            MALICIOUS_IPS.append(str(line.replace('\n', '')))

    # Spark context plus a streaming context ticking every UPDATE_TIMER secs.
    sc = SparkContext(appName="Realtime Packet Classifier")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, UPDATE_TIMER)

    # File streams over the training and live-capture directories.
    trainingStream = ssc.textFileStream(TRAINING_DIR).map(processTrainingLine)
    secondaryTrainingStream = \
        ssc.textFileStream(SEC_TRAINING_DIR).map(processGeneratedLine)
    testingStream = \
        ssc.textFileStream(STREAMING_DIR).map(processGeneratedLine)

    # Streaming logistic-regression model, trained on both training streams.
    model = StreamingLogisticRegressionWithSGD(numIterations=500)
    model.setInitialWeights([0] * 75)
    model.trainOn(trainingStream)
    model.trainOn(secondaryTrainingStream)

    # Classify incoming packets and print up to 50 predictions per batch.
    model.predictOnValues(
        testingStream.map(lambda lp: (lp.label, lp.features))).pprint(50)

    # Run until manually terminated.
    ssc.start()
    ssc.awaitTermination()