Example #1
def main():
    #Reading the test and train files
    trainData = sc.pickleFile(input + '/Train_data.average/part-00000')
    testData = sc.pickleFile(input + '/Test_data.average/part-00000')
    parsedData = trainData.map(parseInput).filter(
        lambda line: len(line.features) != 0 or len(line.label) != 0)
    parsedTestData = testData.map(parseInput).filter(
        lambda line: len(line.features) != 0 or len(line.label) != 0)
    numTreesOptions = [3, 5, 10]
    maxBinsOptions = [5, 10, 15]
    BestError = float('inf')
    bestmaxBins = maxBinsOptions[0]
    bestnumTrees = numTreesOptions[0]

    #Cross validation over maxBins and numTrees
    for x in maxBinsOptions:
        for y in numTreesOptions:
            (Train_RDD, Valid_RDD) = trainData.randomSplit([1, 2], 10)
            parsed_input = Train_RDD.map(parseInput).filter(
                lambda line: len(line.features) != 0 or len(line.label) != 0)
            parsed_valid = Valid_RDD.map(parseInput).filter(
                lambda line: len(line.features) != 0 or len(line.label) != 0)
            model = RandomForest.trainRegressor(parsed_input,
                                                categoricalFeaturesInfo={},
                                                numTrees=y,
                                                featureSubsetStrategy="auto",
                                                impurity='variance',
                                                maxDepth=4,
                                                maxBins=x)
            predictions = model.predict(parsed_valid.map(lambda p: p.features))
            labelsAndPredictions = parsed_valid.map(lambda lp: lp.label).zip(
                predictions)
            #Mean squared error on the validation split (regression, not a 0/1 mismatch count)
            validationMSE = labelsAndPredictions.map(
                lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(parsed_valid.count())
            RMSE = math.sqrt(validationMSE)
            if RMSE < BestError:
                BestError = RMSE
                bestmaxBins = x
                bestnumTrees = y
    #Finding Test error
    model = RandomForest.trainRegressor(parsedData,
                                        categoricalFeaturesInfo={},
                                        numTrees=bestnumTrees,
                                        featureSubsetStrategy="auto",
                                        impurity='variance',
                                        maxDepth=4,
                                        maxBins=bestmaxBins)
    predictions = model.predict(parsedTestData.map(lambda p: p.features))
    labelsAndPredictions = parsedTestData.map(lambda lp: lp.label).zip(
        predictions)
    #Mean squared error on the held-out test set
    testMSE = labelsAndPredictions.map(
        lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(parsedTestData.count())
    RMSE_test = math.sqrt(testMSE)

    #Reporting validation and test error
    print("Best Root Mean Squared Error Validation= " + str(BestError))
    print("Best Root Mean Squared Error Test= " + str(RMSE_test))
Example #2
 def _set_rddModel(self, _type, _SLA, data):
     if _type == 'regression':
         if _SLA == 'randomForest':
             self._rddModel = RandomForest.trainRegressor(
                 data,
                 categoricalFeaturesInfo={},
                 numTrees=int(self.sparkOptions[4]),
                 featureSubsetStrategy=self.sparkOptions[5],
                 impurity='variance',
                 maxDepth=int(self.sparkOptions[1]),
                 maxBins=32)
         else:
             self._rddModel = ""
     else:  #classification
         if _SLA == 'randomForest':
             print(self.numClasses)
             self._rddModel = RandomForest.trainClassifier(
                 data,
                 numClasses=self.numClasses,
                 categoricalFeaturesInfo={},
                 numTrees=int(self.sparkOptions[4]),
                 maxDepth=int(self.sparkOptions[1]),
                 featureSubsetStrategy=self.sparkOptions[5],
                 impurity=self.sparkOptions[2])
         else:
             self._rddModel = ""
Example #3
def main():
    input_train = sys.argv[1]
    input_test = sys.argv[2]

    conf = SparkConf().setAppName('Sentiment Analysis with Random Forest')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    train = sc.textFile(input_train).cache()
    test = sc.textFile(input_test).cache()

    '''sbaronia - get training and testing labeled points'''
    train_lp = train.map(to_labeledpoint).cache()
    test_lp = test.map(to_labeledpoint).cache()

    '''sbaronia - run RandomForest regression on our training data with
    default options except numTrees = 5'''
    rf_model = RandomForest.trainRegressor(train_lp,
                                           categoricalFeaturesInfo={},
                                           numTrees=5,
                                           featureSubsetStrategy="auto",
                                           impurity='variance',
                                           maxDepth=4,
                                           maxBins=32)
    
    '''sbaronia - run predictions on testing data and calculate RMSE value'''
    predictions = rf_model.predict(test_lp.map(lambda x: x.features))
    labelsAndPredictions = test_lp.map(lambda lp: lp.label).zip(predictions)
    rmse = math.sqrt(labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2)
                     .reduce(lambda x, y: x + y) / float(test_lp.count()))

    print("RMSE = " + str(rmse))
Example #4
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
        except ValueError:
            self.fail()
Example #5
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()
Example #6
def Regression_Model(filename):
    open_price, close_price, open_price_train, close_price_train, True_price, True_price_train, Date = get_csv_data(
        filename)
    output = []
    for i in range(1, len(Date)):
        tmp = LabeledPoint(label=True_price_train[i],
                           features=[close_price_train[i]])
        output.append(tmp)

    output_train_RDD = sc.parallelize(output).cache()
    lrm = LinearRegressionWithSGD.train(output_train_RDD,
                                        step=0.001,
                                        iterations=100000)
    tree = DecisionTree.trainRegressor(output_train_RDD,
                                       categoricalFeaturesInfo={},
                                       impurity='variance',
                                       maxDepth=5,
                                       maxBins=30)
    forest = RandomForest.trainRegressor(output_train_RDD,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='variance',
                                         maxDepth=5,
                                         maxBins=30)
    gradient = GradientBoostedTrees.trainRegressor(output_train_RDD,
                                                   categoricalFeaturesInfo={},
                                                   numIterations=10)

    print("\n============MODEL Evaluation=============\n")
    model_name = [
        'LinearRegression', 'DecisionTree', 'RandomForest',
        'GradientBoostedTrees'
    ]
    es_modelname = ['lrm', 'tree', 'forest', 'gradient']
    result = ''
    x = 0
    err = 1000
    test_model = 'LinearRegression'
    #Swap in a different model here
    output_model_RDD = lrm
    for model in [lrm, tree, forest, gradient]:
        predictions = model.predict(output_train_RDD.map(lambda x: x.features))
        labelsAndPredictions = output_train_RDD.map(lambda lp: lp.label).zip(
            predictions)
        #Root mean squared error on the training data
        RMSE = (
            labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /
            float(output_train_RDD.count()))**0.5
        #print ("Predictions: ", valuesAndPreds.take(10))
        result += model_name[x] + "\tRoot Mean Squared Error\t=" + str(RMSE) + "\n"
        if (err > RMSE):
            err = RMSE
            output_model = model
            es_model = es_modelname[x]
        x += 1
    print(result)
    print(es_model)
    return Date, True_price, output_model_RDD, open_price, close_price, es_model
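Note that the loop above ranks the four models by error on the same RDD they were trained on, which tends to favour the most overfit model. A minimal sketch of scoring on a held-out split instead (the helper and split below are illustrative, not part of the original):

# Sketch only: hold out part of the data and score candidates on it.
train_rdd, holdout_rdd = output_train_RDD.randomSplit([0.8, 0.2], seed=42)

def holdout_rmse(model, holdout):
    preds = model.predict(holdout.map(lambda p: p.features))
    pairs = holdout.map(lambda lp: lp.label).zip(preds)
    mse = pairs.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(holdout.count())
    return mse ** 0.5

# Each candidate would then be trained on train_rdd and compared via
# holdout_rmse(candidate, holdout_rdd) before picking the winner.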
Example #7
def trainRandomForestModel(data):
    """
    Train a random forest regression model and return it
    :param data: RDD[LabeledPoint]
    :return: random forest regression model
    """
    from pyspark.mllib.tree import RandomForest
    model = RandomForest.trainRegressor(data, categoricalFeaturesInfo={}, numTrees=2000, featureSubsetStrategy="auto", impurity="variance", maxDepth=4, maxBins=32)
    return model
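A minimal usage sketch for this helper, assuming an active SparkContext sc; the toy data is illustrative only (with 2000 trees, real use calls for a far larger dataset):

from pyspark.mllib.regression import LabeledPoint

points = [LabeledPoint(1.0, [1.0, 0.0]),
          LabeledPoint(2.0, [2.0, 1.0]),
          LabeledPoint(3.0, [3.0, 1.0])]
data = sc.parallelize(points)          # RDD[LabeledPoint]
model = trainRandomForestModel(data)
print(model.predict([2.5, 1.0]))       # predict a single feature vector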
Example #8
    def build_regressors(self, split_dataset, split_kmeans_dataset,
                         feature_keys):
        self.logger.info('building regressors')
        mce_tuples = []
        for dataset, kmeans_dataset in zip(split_dataset,
                                           split_kmeans_dataset):
            kmeans_train_set = []
            for item in kmeans_dataset:
                features = [item[column] for column in feature_keys]
                kmeans_train_set.append(array(features))
            # print "kmeans_train_set", len(kmeans_train_set)
            del kmeans_dataset
            kmeans_train_set = sc.parallelize(kmeans_train_set)
            clusters = KMeans.train(kmeans_train_set,
                                    100,
                                    maxIterations=200,
                                    runs=10,
                                    initializationMode="random")
            del kmeans_train_set
            data = []
            for item in dataset:
                features = []
                for column in feature_keys:
                    features.append(item[column])
                data.append(LabeledPoint(item[self.target_key], features))
            del dataset
            data = sc.parallelize(data)

            def preprocess(observation):
                observation.label = float(observation.label / 10000)
                return observation

            data = data.map(preprocess)
            (trainingData, testData) = data.randomSplit([0.7, 0.3])
            # del data
            model = RandomForest.trainRegressor(
                trainingData,
                categoricalFeaturesInfo={},
                numTrees=self.rfr_config['num_trees'],
                featureSubsetStrategy=self.rfr_config['feature_subset_strategy'],  # "all",
                impurity='variance',
                maxDepth=self.rfr_config['max_depth'])
            predictions = model.predict(testData.map(lambda x: x.features))
            labelsAndPredictions = testData.map(lambda lp: lp.label).zip(
                predictions)
            testMSE = -1
            try:
                testMSE = labelsAndPredictions.map(
                    lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
            except:
                pass
            mce_tuples.append((model, clusters, testMSE))
        self.logger.info('regressors build finished')
        return mce_tuples
Example #9
def train_model(filename='final_tip_all.txt',
                test_portion=0.2,
                cat_var=cat_var_dic,
                n_tree=250,
                mode_feature_strat='auto',
                max_deep=5,
                max_bin=32):
    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose

    sc = SparkContext()

    sqlContext = SQLContext(sc)

    spark = SparkSession.builder.appName("RandomForestRegressor").getOrCreate()

    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, filename)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData,
     testData) = data.randomSplit([1 - test_portion, test_portion])

    ##### TREAT TEMP AS CONTINUOUS ####
    model = RandomForest.trainRegressor(
        trainingData,
        categoricalFeaturesInfo=cat_var,
        numTrees=n_tree,
        featureSubsetStrategy=mode_feature_strat,
        impurity='variance',
        maxDepth=max_deep,
        maxBins=max_bin)

    ############ prediction !!!! ####
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(
        lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
    testRMSE = math.sqrt(testMSE)

    #predictions.takeSample(withReplacement = False, num = 5)
    # convert the rdd object to dataframe as follows
    df_predictions = predictions.map(lambda x: (x, )).toDF()
    df_predictions.cache()
    #df_predictions.show(5, False)

    #print('Learned regression forest model:')
    #print(model.toDebugString())
    print('Test Root Mean Squared Error on ' + filename + ' = ' +
          str(testRMSE))
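MLUtils.loadLibSVMFile expects the input file in LibSVM text format: a label followed by 1-based index:value pairs on each line. A small sketch of that layout, assuming a path Spark can read (the real contents of final_tip_all.txt are not shown here):

# Sketch only: write a tiny LibSVM file and load it back as an RDD of LabeledPoints.
sample = ["1.5 1:0.2 3:4.0", "0.0 2:1.3", "2.25 1:0.9 2:0.1 3:7.5"]
with open("/tmp/sample_libsvm.txt", "w") as f:
    f.write("\n".join(sample))

toy = MLUtils.loadLibSVMFile(sc, "/tmp/sample_libsvm.txt")
print(toy.take(3))  # LabeledPoints with SparseVector features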
Example #10
def getRandomForestRMSE(trees_array):
	valRMSE_list = []
	for trees in trees_array:
		model = RandomForest.trainRegressor(train_featureScoreTimeRDD, categoricalFeaturesInfo={},
                                    numTrees=trees, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=4, maxBins=32)
		predictions = model.predict(val_featureScoreTimeRDD.map(lambda lp: lp.features))
		labelsAndPreds = val_featureScoreTimeRDD.map(lambda lp: lp.label).zip(predictions)
		valMSE = labelsAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(val_featureScoreTimeRDD.count())
		valRMSE = valMSE ** 0.5
		valRMSE_list.append((trees, valRMSE))
	return valRMSE_list
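A usage sketch for this helper, assuming the train_featureScoreTimeRDD and val_featureScoreTimeRDD RDDs it closes over have already been built:

# Sweep a few forest sizes and keep the one with the lowest validation RMSE.
results = getRandomForestRMSE([5, 10, 25, 50])
best_trees, best_rmse = min(results, key=lambda tr: tr[1])
print("best numTrees = {}, validation RMSE = {}".format(best_trees, best_rmse))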
Example #11
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example #12
def testRegression(trainingData, testData):    
	# Train a RandomForest model.    
	#  Empty categoricalFeaturesInfo indicates all features are continuous.    
	#  Note: Use larger numTrees in practice.    
	#  Setting featureSubsetStrategy="auto" lets the algorithm choose.    
	model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},numTrees=3, featureSubsetStrategy="auto",impurity='variance', maxDepth=4, maxBins=32)
	# Evaluate model on test instances and compute test error    
	predictions = model.predict(testData.map(lambda x: x.features))
	labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
	testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1])).sum() / float(testData.count())
	print('Test Mean Squared Error = ' + str(testMSE))
	print('Learned regression forest model:')
	print(model.toDebugString())
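As the comment notes, an empty categoricalFeaturesInfo treats every feature as continuous; when some features are categorical, the dict maps a feature index to its number of categories (values encoded as 0..k-1, and maxBins must be at least the largest arity). A minimal sketch, assuming feature 0 has 3 categories and feature 4 has 2:

model = RandomForest.trainRegressor(trainingData,
                                    categoricalFeaturesInfo={0: 3, 4: 2},
                                    numTrees=50, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=4, maxBins=32)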
Example #13
 def train(self):
     """
     Trains the Random Forest model with the optimal parameters.
     @return: The trained RF model
     """
     target_test = self._test_data.map(lambda p: p.label)
     hyper_params = self.find_rf_parameters()
     rf_model = RandomForest.trainRegressor(self._train_data,
                                            categoricalFeaturesInfo={},
                                            numTrees=hyper_params['trees'],
                                            featureSubsetStrategy="auto",
                                            impurity="variance",
                                            maxDepth=hyper_params['depth'],
                                            maxBins=54)
     return rf_model
Example #14
def trainRandomForestModel(data):
    """
    Train a random forest regression model and return it
    :param data: RDD[LabeledPoint]
    :return: random forest regression model
    """
    from pyspark.mllib.tree import RandomForest
    model = RandomForest.trainRegressor(data,
                                        categoricalFeaturesInfo={},
                                        numTrees=2000,
                                        featureSubsetStrategy="auto",
                                        impurity="variance",
                                        maxDepth=4,
                                        maxBins=32)
    return model
Example #15
def testRegression(trainingData, testData):
    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='variance', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
        .sum() / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression forest model:')
    print(model.toDebugString())
Example #16
def mllib_rf_regress(lp_train_rdd, lp_test_rdd, trees, depth, bins):
    ''' RandomForest Regression
    takes in train/test LabeledPoint rdds
    '''
    model = RandomForest.trainRegressor(lp_train_rdd,
                                        categoricalFeaturesInfo={},
                                        numTrees=trees,
                                        featureSubsetStrategy="auto",
                                        impurity='variance',
                                        maxDepth=depth,
                                        maxBins=bins)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(lp_test_rdd.map(lambda x: x.features))
    labelsAndPredictions = lp_test_rdd.map(lambda lp: lp.label).zip(
        predictions)
    test_error = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) *
                                          (lp[0] - lp[1])).sum() / float(
                                              lp_test_rdd.count())
    return test_error
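A small grid-search sketch built on this helper, assuming lp_train_rdd and lp_test_rdd LabeledPoint RDDs already exist:

best = None
for trees in [10, 25]:
    for depth in [4, 8]:
        mse = mllib_rf_regress(lp_train_rdd, lp_test_rdd, trees, depth, 32)
        if best is None or mse < best[0]:
            best = (mse, trees, depth)
print("best MSE {} with numTrees={}, maxDepth={}".format(*best))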
Example #17
    def find_rf_parameters(self):
        """
        Iterates through a set of numbers corresponding to numTrees and maxDepth to
        search for the optimal hyperparameters.
        @return: The best hyperparameters, that minimise MSE
        """
        min_error = 99999999
        num_trees = self._trees[0]
        depth = self._depths[0]
        for i in self._trees:
            for j in self._depths:
                rf_model = RandomForest.trainRegressor(
                    self._train_data,
                    categoricalFeaturesInfo={
                        3: 153,
                        4: 4,
                        5: 80
                    },
                    numTrees=i,
                    featureSubsetStrategy="auto",
                    impurity="variance",
                    maxDepth=j,
                    maxBins=54)
                predictions = rf_model.predict(
                    self._train_data.map(lambda x: x.features))
                target_train = self._train_data.map(lambda p: p.label)
                rf_values = target_train.zip(
                    predictions.map(lambda x: float(x)))
                metrics_rf = RegressionMetrics(rf_values)
                mse = metrics_rf.meanSquaredError
                if (mse < min_error):
                    min_error = mse
                    num_trees = i
                    depth = j

        self._log.info('Estimating Parameters for Random Forests:\n=====')
        self._log.info('MSE = {}, trees = {}, depth = {}'.format(
            min_error, num_trees, depth))
        return {'trees': num_trees, 'depth': depth}
Example #18
    def train_amount_model(self, model, data, i):
        rdd_data = self.sc.parallelize(data)
        self.logger.info('Start to train the amount model')
        if self.amount_prediction_method == self.ARTIFICIAL_NEURAL_NETWORK:
            input_num = self.feature_num
            layers = [input_num, input_num / 3 * 2, input_num / 3, 1]

            neural_network = NeuralNetworkSpark(layers=layers, bias=0)
            model = neural_network.train(rdd_data, method=neural_network.BP, seed=1234, learn_rate=0.0001,
                                         iteration=15, model=model)
        elif self.amount_prediction_method == self.RANDOM_FOREST:
            model = RandomForest.trainRegressor(rdd_data, categoricalFeaturesInfo={}, numTrees=40,
                                                featureSubsetStrategy="auto", impurity='variance', maxDepth=20,
                                                maxBins=32)

        elif self.amount_prediction_method == self.LINEAR_REGRESSION:
            model = LinearRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                  initialWeights=model.weights if model is not None else None)

        else:
            self.logger.error("Unknown training method {}".format(self.amount_prediction_method))
            raise ValueError("Unknown training method {}".format(self.amount_prediction_method))
        return model
Example #19
     features_modeled_train, features_categorical_indexed_vec_train)
 ## select the one-hot-encoded categorical features along with numerical features as well as label to contrust the modeling dataset
 df_train_modeling = df_train.select(features_modeled_train)
 ## df_train_modeling_rdd for mllib package
 df_train_modeling_rdd = df_train_modeling.rdd.map(
     lambda p: convert_sparsevec_to_vec_df(
         p, features_categorical_indexed_vec_index_train))
 df_train_modeling_rdd = df_train_modeling_rdd.map(
     lambda l: LabeledPoint(l[0], l[1:]))
 ################################################## 5: train random forest regression model
 ## random forest
 ## train model
 rfModel = RandomForest.trainRegressor(df_train_modeling_rdd,
                                       categoricalFeaturesInfo={},
                                       numTrees=100,
                                       featureSubsetStrategy="auto",
                                       impurity='variance',
                                       maxDepth=10,
                                       maxBins=32)
 # Predict on train data
 predictions = rfModel.predict(
     df_train_modeling_rdd.map(lambda l: l.features))
 ## Evaluation of the model
 predictionAndObservations = predictions.zip(
     df_train_modeling_rdd.map(lambda l: l.label))
 testMetrics = RegressionMetrics(predictionAndObservations)
 model_time = str(model_time[0][0])
 df_model_performance = spark.createDataFrame(
     sc.parallelize(
         [[model_time, testMetrics.rootMeanSquaredError, testMetrics.r2]]),
     ["model_time", "RMSE", "R2"])
    sparkConf = SparkConf().setAppName("BeijingTomorrow").setMaster("local")
    sc=SparkContext(conf=sparkConf)
    sqlContext = SQLContext(sc)

    (visual_training_image_array , visual_training_outcome_array ) = loadVisualTrainingDataToArray()
    #We have to turn it into a list of observations
    visual_training_data = []
    for i in range(0,len(visual_training_outcome_array) ):
        visual_training_data.append((visual_training_outcome_array[i],visual_training_image_array[i]))
    visual_training_rdd = sc.parallelize(visual_training_data)
    visual_data_flattened = visual_training_rdd.map(lambda x : ( x[0] , averageBrightness4By4(x[1])) )
    visual_data_labeled_points = visual_data_flattened.map(lambda x : varsToLabeledPoint(x))
    toprint=visual_data_labeled_points.take(1)
    print(str(toprint))
    visual_model = RandomForest.trainRegressor(visual_data_labeled_points, categoricalFeaturesInfo={},
                                    numTrees=1000, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=5, maxBins=100)
    #visual_model = LinearRegressionWithSGD.train(visual_data_labeled_points, iterations=3,intercept=True)

    visual_training_vectors = visual_data_flattened.map(lambda x : featuresToVectors(x[1]))
    toprint = visual_training_vectors.take(1)
    print(str(toprint))
    visual_in_sample_predictions = visual_model.predict(visual_training_vectors)
    visual_in_sample_labels_and_predictions = visual_data_labeled_points.map(lambda lp: lp.label).zip(visual_in_sample_predictions)
    visual_in_sample_labels_and_predictions.foreach(printline)
    squaresdf = visual_in_sample_labels_and_predictions.map(lambda p : (p[0] , p[0]*p[0] , p[0] - p[1] , (p[0] - p[1])*(p[0] - p[1]) , 1 ) )
    squares = squaresdf.reduce(lambda a , b : (a[0]+b[0] , a[1]+b[1] , a[2]+b[2] , a[3]+b[3] , a[4]+b[4] ) )
    tss = float(squares[1]) - float(squares[0]*squares[0])/float(squares[4])
    rss = float(squares[3]) - float(squares[2]*squares[2])/float(squares[4])
    r2 = 1-rss/tss
    print("Training set:")
Example #21
def rf(userID, n):
    
    ### CREATING GAME PROFILE DF ####
    game_profiles = get_game_profiles()
    df = pd.DataFrame(game_profiles)
    df_clean = preprocess(df)

    # Full df for games only, no playtimes (for prediction later)
    df_games = df_clean.drop('genres', 1)
    #df_games = df_games.drop('name', 1) 
    df_games = df_games.drop('appID', 1)
    df_games = df_games.drop('cat', 1)
    df_games = df_games.drop('tags', 1)
    df_games = df_games.drop('type', 1)


    games = get_games('/media/sf_AdvancedML/Final/gameData.txt')
    missing = set()

    ### CROSS VALIDATING ###    
    all_accur, avg_accur = cross_validate(df_clean, games, 10)
    print "Accuracies, Average Accuracy"
    print all_accur, avg_accur

    ### TRAIN ON INCOMING USER ###
    ownedGames = build_user_dataset.get_ownedGames(userID) #json object
    with open('/media/sf_AdvancedML/Final/userData'+str(userID)+'.txt', 'w') as outFile:
        if len(ownedGames) == 0:
            print "This user's library is empty or unreachable."
            return
        json.dump({'user': userID, 'ownedGames':ownedGames}, outFile)

    # initialize empty frame with appropriate columns
    df = pd.DataFrame(columns = list(df_clean.columns.values)+['playtime'])

    # Randomly select user's library
    gamesOwned = get_gamesOwned('/media/sf_AdvancedML/Final/userData'+str(userID)+'.txt')
    user = random.choice(gamesOwned.values())
    gamesList = gamesOwned[gamesOwned.keys()[0]].keys()

    # Connect playtime to game df for games owned
    if len(user.values()) > 0:
        #print user.values()[0]
        for k, v in user.values()[0].iteritems():
            if k in games:
                row = df_clean.loc[df_clean['name'] == k]
                row['playtime'] = np.log(v)
                df = df.append(row)
            else:
                missing.add(k)

    df = df.drop('genres', 1)
    df = df.drop('name', 1)
    df = df.drop('appID', 1)
    df = df.drop('cat', 1)
    df = df.drop('tags', 1)
    df = df.drop('type', 1)

    # Pass User DF to Spark
    df.to_csv('/media/sf_AdvancedML/Final/RF.csv')

    data = sc.textFile('/media/sf_AdvancedML/Final/RF.csv')
    header = data.first()
    data = data.filter(lambda x: x != header)
    data = data.map(lambda line: convertUni(line))
    data = data.map(lambda line: line.split(','))

    # RDD of (label, features) pairs
    data = data.map(lambda line: LabeledPoint(line[0], line[1:]))

    model = RandomForest.trainRegressor(data, categoricalFeaturesInfo = {},
                                        numTrees = 3, featureSubsetStrategy = "auto",
                                        impurity = 'variance', maxDepth = 4)

    ### PREDICT ###
    # for every game in Steam library #
    df_games.to_csv('/media/sf_AdvancedML/Final/RF_games_names.csv')
    df_games.drop('name', 1).to_csv('/media/sf_AdvancedML/Final/RF_games.csv')

    data_games = sc.textFile('/media/sf_AdvancedML/Final/RF_games.csv')
    header = data_games.first()
    data_games = data_games.filter(lambda x: x != header)
    data_games = data_games.map(lambda line: convertUni(line))
    data_games = data_games.map(lambda line: line.split(','))

    data_test = sc.textFile('/media/sf_AdvancedML/Final/RF_games_names.csv')
    header2 = data_test.first()
    data_test = data_test.filter(lambda x: x != header2)
    data_test = data_test.map(lambda line: convertUni(line))
    data_test = data_test.map(lambda line: line.split(','))
    
    predictions = model.predict(data_games)
    idPredictions = data_test.map(lambda x: x[6]).zip(predictions)

    # Filter predictions for games owned or trailers/apps
    idPredictions = idPredictions.filter(lambda x: x[0] not in gamesList)

    # Export predictions to pandas df
    predDF = idPredictions.toDF()
    predDF = predDF.toPandas()  # Name, Prediction
    predDF.columns = ['Name', 'PredictedPlaytime']

    # Returning top n not in library
    sorted_predDF = predDF.sort_values(by = 'PredictedPlaytime', ascending = False)
    recs = []
    #while len(recs) <= n:
        # check if rec in library
        #game = 
        # check if game or trailer/app

    return sorted_predDF[:n]
Example #22
    return LabeledPoint(loss, array(line_split[1:len(line_split) - 1]))


train_data_labeled_point = train_data_csv.map(parse_labled_point)
test_data_labeled_point = test_data_csv.map(parse_labled_point)

# ======================= TRAIN MODEL =================================================
t0 = time()
# a smaller MSE generally indicates a better estimate
# parameters below were chosen after a round of tweaking:
# featureSubsetStrategy="auto" lets the algorithm pick a strategy suited to the dataset
# larger numTrees and maxDepth are more accurate but take longer to train,
# so 10 seems like a reasonable balance
model = RandomForest.trainRegressor(train_data_labeled_point,
                                    categoricalFeaturesInfo={},
                                    numTrees=10,
                                    maxDepth=10,
                                    featureSubsetStrategy="auto")
tt = time() - t0
print("RandomForest trained in {} seconds".format(round(tt, 3)))

# ======================= TEMPORARY TEST PREDICT MODEL =================================================
t0 = time()
predictions = model.predict(test_data_labeled_point.map(lambda x: x.features))
labels_preds = test_data_labeled_point.map(lambda x: x.label).zip(predictions)
testMSE = labels_preds.map(lambda lp: (lp[0] - lp[1]) *
                           (lp[0] - lp[1])).sum() / float(
                               test_data_labeled_point.count())
tt = time() - t0
print("Prediction made in {} seconds.".format(round(tt, 3)))
print('Test Mean Squared Error = ' + str(testMSE))
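Since the comment block above is about trading accuracy against training time, a small sketch that times a few forest sizes on the same data (names reused from this example, the numbers are arbitrary):

for n_trees in [5, 10, 20]:
    t0 = time()
    m = RandomForest.trainRegressor(train_data_labeled_point,
                                    categoricalFeaturesInfo={},
                                    numTrees=n_trees, maxDepth=10,
                                    featureSubsetStrategy="auto")
    preds = m.predict(test_data_labeled_point.map(lambda x: x.features))
    pairs = test_data_labeled_point.map(lambda x: x.label).zip(preds)
    mse = pairs.map(lambda lp: (lp[0] - lp[1]) ** 2).sum() / float(test_data_labeled_point.count())
    print("numTrees={}: MSE={}, trained in {}s".format(n_trees, mse, round(time() - t0, 3)))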
Example #23
# sc.setLogLevel("ERROR")

t1 = datetime.datetime.now()

data = sc.textFile('hdfs://node1:9000/input/checkerboard2x2_train.txt')
data = data.map(lambda _: _.split(' '))
data = data.map(lambda row: LabeledPoint(row[-1], row[:-1]))

#print(data .take(20))

train, test = data.randomSplit([95.0, 5.0])

# training model

reg = RandomForest.trainRegressor(train,
                                  numTrees=100,
                                  categoricalFeaturesInfo={})

t2 = datetime.datetime.now()
time_difference = t2 - t1
time_difference_in_minutes = time_difference / timedelta(minutes=1)
print('Time elapsed = ', time_difference_in_minutes, ' minutes')

predictions = reg.predict(test.map(lambda x: x.features))

# creating RDD of pairs of (true_label, predicted_label)
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)

testMSE = labelsAndPredictions.map(lambda x: (x[0] - x[1])**2).sum() / float(
    test.count())
Example #24
# Data points as LabeledPoints
# (crime count, [beat, week])
predArrayLP = joinedData.map(lambda x: LabeledPoint(x[
    0], [weekDict[x[1][0]], beatsDict[x[1][1]], x[1][2]]))

# Split into training and testing set. 70-30 split.
(train, test) = predArrayLP.randomSplit([0.7, 0.3])

# Feature categories :
featuresCat = {0: len(beatsDict), 1: 53}
maxBins = max(len(beatsDict), len(weekDict))

model = RandomForest.trainRegressor(train,
                                    categoricalFeaturesInfo=featuresCat,
                                    numTrees=10,
                                    featureSubsetStrategy="auto",
                                    impurity='variance',
                                    maxDepth=5,
                                    maxBins=maxBins)

# Evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
#rschoolCountBeats = schoolCount.map(lambda x: x[0])
predOutput = predictions.collect()
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(test.count())
print('Test Mean Squared Error = ' + str(testMSE))

### Write output to file ###
with open("predictions.txt", 'wb') as f:
Example #25
        continue
    #print(item[1])
    #print(matched.first())
    labeledArray.append(createLabeledPoint(item, matched.first()))
print("Appended DataSets")

#Convert the array of labelled points into an RDD
dataSet = sc.parallelize(labeledArray)

#Split the data into testing and training
(trainingData, testData) = dataSet.randomSplit([.7, .3])

#Create the Random Forest model using the training data and generate predictions using the test data
modelRF = RandomForest.trainRegressor(trainingData,
                                      categoricalFeaturesInfo={},
                                      numTrees=5,
                                      impurity='variance',
                                      maxDepth=4,
                                      maxBins=32)
predictionsRF = modelRF.predict(testData.map(lambda x: x.features))

#Gradient Boosted Model
modelGB = GradientBoostedTrees.trainRegressor(trainingData,
                                              categoricalFeaturesInfo={},
                                              numIterations=3)
predictionsGB = modelGB.predict(testData.map(lambda x: x.features))

#Linear Regression Model
modelLin = LinearRegressionWithSGD.train(trainingData,
                                         iterations=100,
                                         step=0.00000001)
predictionsLin = modelLin.predict(testData.map(lambda x: x.features))
Example #26
val_data = truetestData.map(lambda line: LabeledPoint(line[7], line[0:7]))

# debug
print(data.take(1))
print(val_data.take(1))

# for holdout validation
(trData, tData) = data.randomSplit([0.7, 0.3])

# random forest training model
mod = RandomForest.trainRegressor(trData,
                                  categoricalFeaturesInfo={
                                      0: 13,
                                      1: 1499,
                                      2: 2
                                  },
                                  numTrees=4,
                                  featureSubsetStrategy="auto",
                                  impurity='variance',
                                  maxDepth=8,
                                  maxBins=1500)

# prediction and evaluation
predictions = mod.predict(tData.map(lambda x: x.features))
pred = mod.predict(val_data.map(lambda x: x.features))
labelsAndPredictions = tData.map(lambda lp: lp.label).zip(predictions)
truePred = val_data.map(lambda lp: lp.label).zip(pred)
metrics = RegressionMetrics(labelsAndPredictions)
met2 = RegressionMetrics(truePred)
# Squared Error
print("Validation MSE = %s" % metrics.meanSquaredError)
Example #27
# Putting data in vector assembler form
assembler_train = VectorAssembler(inputCols=newlist_train,
                                  outputCol="features")

transformed_train = assembler_train.transform(merged_train)

# Creating input dataset in the form of labeled point for training the model
data_train = (transformed_train.select(
    "features",
    "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

# Training the model using Random forest regressor
model_train = RandomForest.trainRegressor(data_train,
                                          categoricalFeaturesInfo={},
                                          numTrees=10,
                                          featureSubsetStrategy="auto",
                                          impurity='variance',
                                          maxDepth=8,
                                          maxBins=32)

########################################################################################################
# PREDICTIONS ON FINAL (TEST) DATASET USING DEVELOPED MODEL 'model_train'
########################################################################################################

# Creating a list of features to be used for predictions
removelist_final = set(['business_id', 'review_id', 'u_review_count'])
newlist_final = [
    v for i, v in enumerate(merged_final_ku_kb.columns)
    if v not in removelist_final
]
Example #28
    def test_regression(self):
        from pyspark.mllib.regression import (
            LinearRegressionWithSGD,
            LassoWithSGD,
            RidgeRegressionWithSGD,
        )
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2]),
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4
        )
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1
        )
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4
        )
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()

        # Verify that maxBins is being passed through
        GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32
        )
        with self.assertRaises(Exception):
            GradientBoostedTrees.trainRegressor(
                rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1
            )
Example #29
kmeans_features = df_valid.distinct().map(lambda x: np.array([x.song_hotttnesss, x.loudness]))
clusters = KMeans.train(kmeans_features, 4, maxIterations=10, runs=10, initializationMode="random")

from numpy import array

# regression

regression_features = df_valid.distinct().map(lambda x: LabeledPoint(x.song_hotttnesss, [x.loudness]))
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest, RandomForestModel

training_data, test_data = regression_features.randomSplit([0.8, 0.2])

model = RandomForest.trainRegressor(
    training_data,
    categoricalFeaturesInfo={},
    numTrees=3,
    featureSubsetStrategy="auto",
    impurity="variance",
    maxDepth=4,
    maxBins=32,
)

print(model.toDebugString())

# prediction error
predictions = model.predict(test_data.map(lambda x: x.features))
labelsAndPredictions = test_data.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(test_data.count())
print ("Test Mean Squared Error = " + str(testMSE))
Example #30
testvecData = testdata.map(parseVec)
# use map operation to map the first column of each row to be the label, and the rest into a vector, combined they become a tuple called LabelPoint in Spark
testparsedData = testdata.map(parsePoint)


	
# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  numTrees is the number of trees used in the model
#  Setting featureSubsetStrategy="auto" lets the algorithm choose what feature each tree use
#  impurity is variance for regression
#  maxDepth is the maximum depth of each tree
model1 = RandomForest.trainRegressor(trainparsedData
									, categoricalFeaturesInfo={}
									, numTrees=1000
									, featureSubsetStrategy="auto"
									, impurity='variance'
									, maxDepth=13
									, maxBins=32)


# evaluate the training error
# first make the prediction and create a new "vector" of all the predictions
trainpredictions = model1.predict(trainparsedData.map(lambda x: x.features))
# then you column bind the prediction and actual values into a new RDD
trainlabelsAndPredictions = trainparsedData.map(lambda lp: lp.label).zip(trainpredictions)
# use map operation to compute MSE
trainMSE1 = trainlabelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(trainparsedData.count())

# use the the Statistics library to obtain the variance
summary = Statistics.colStats(trainvecData)
Example #31
def main():
    text = sc.textFile(inputs)

    nltk_data_path = "[change to your own nltk_data location]"  # maybe changed to the sfu server path
    nltk.data.path.append(nltk_data_path)
    cleaned_review = text.map(clean_reviewf).cache()

    reviews_txt = cleaned_review.map(lambda review: review['reviewText'])
    reviews = cleaned_review.map(lambda review: (review['overall'], review[
        'reviewText'], review['reviewTime'])).cache()
    training_reviews = reviews.filter(
        lambda (rating, review_text, review_date): review_date.tm_year < 2014)
    testing_reviews = reviews.filter(
        lambda (rating, review_text, review_date): review_date.tm_year == 2014)
    training_data = training_reviews.map(
        lambda (rating, review_text, review_date):
        (rating, review_text)).zipWithIndex().cache()
    testing_data = testing_reviews.map(
        lambda (rating, review_text, review_date):
        (rating, review_text)).zipWithIndex().cache()

    training_rating = training_data.map(
        lambda ((rating, review_text), review_index): (review_index, rating))
    training_review_text = training_data.map(lambda (
        (rating, review_text), review_index): (review_index, review_text))
    training_review_text_flat = training_review_text.flatMapValues(myf)
    training_review_text_flat = training_review_text_flat.map(
        lambda (review_index, review_word): (review_word, review_index))

    testing_rating = testing_data.map(
        lambda ((rating, review_text), review_index): (review_index, rating))
    testing_review_text = testing_data.map(lambda (
        (rating, review_text), review_index): (review_index, review_text))
    testing_review_text_flat = testing_review_text.flatMapValues(myf)
    testing_review_text_flat = testing_review_text_flat.map(
        lambda (review_index, review_word): (review_word, review_index))

    word2vec_model = generate_word2vec_model(reviews_txt)
    mv = word2vec_model.getVectors()
    # this step seems redundant but necessary
    mvdct = []
    for k, v in mv.items():
        vec = [f for f in v]
        mvdct.append((k, vec))
    dct_rdd = sc.parallelize(mvdct)

    training_feature_vecs = dct_rdd.join(training_review_text_flat)
    training_vecs = training_feature_vecs.map(lambda (w, (
        feature_vec, review_index)): (review_index, (feature_vec, 1)))
    training_reduce_vecs = training_vecs.reduceByKey(
        lambda v1, v2: (np.sum([v1[0], v2[0]], axis=0), v1[1] + v2[1]))
    training_avg_vecs = training_reduce_vecs.map(lambda (review_index, (
        feature_vec, ct)): (review_index, np.array(feature_vec) / float(ct)))
    training_rating_avgf = training_rating.join(training_avg_vecs)
    training_lps = training_rating_avgf.map(get_lp)

    testing_feature_vecs = dct_rdd.join(testing_review_text_flat)
    testing_vecs = testing_feature_vecs.map(lambda (w, (
        feature_vec, review_index)): (review_index, (feature_vec, 1)))
    testing_reduce_vecs = testing_vecs.reduceByKey(
        lambda v1, v2: (np.sum([v1[0], v2[0]], axis=0), v1[1] + v2[1]))
    testing_avg_vecs = testing_reduce_vecs.map(lambda (review_index, (
        feature_vec, ct)): (review_index, np.array(feature_vec) / float(ct)))
    testing_rating_avgf = testing_rating.join(testing_avg_vecs)
    testing_lps = testing_rating_avgf.map(get_lp)

    trees_nums = range(2, 10)
    results = []
    for trees_num in trees_nums:
        rf_model = RandomForest.trainRegressor(training_lps,
                                               categoricalFeaturesInfo={},
                                               numTrees=trees_num,
                                               featureSubsetStrategy="auto",
                                               impurity='variance',
                                               maxDepth=10,
                                               maxBins=32)
        predictions = rf_model.predict(testing_lps.map(lambda x: x.features))
        labelsAndPredictions = testing_lps.map(lambda lp: lp.label).zip(
            predictions)
        MSE = labelsAndPredictions.map(lambda (v, p): (v - p) *
                                       (v - p)).sum() / float(
                                           testing_lps.count())
        RMSE = math.sqrt(MSE)
        result = 'tree nums: ' + str(trees_num) + ', RMSE: ' + str(RMSE)
        results.append(result)

    outdata = sc.parallelize(results)
    outdata.saveAsTextFile(output)
Example #32
target_data = keyed_data.join(keyed_target)
labled_point_data = target_data.map(lambda tup: LabeledPoint(tup[1][1][0], tup[1][0][0].split(',')))

#map(lambda line: line.split(",")).map(lambda line: tuple((feature for feature in line)))

# Split the data into training and test sets (30% held out for testing)
print("Creating Training and Test Data Split")
(trainingData, testData) = labled_point_data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.

model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    numTrees=5, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=8, maxBins=32)

# # Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))

testAccuracy = labelsAndPredictions.map(lambda vp: 1 if (abs(vp[0] - vp[1]) < 10) else 0).sum() / float(testData.count())
print('Total Accuracy = ' + str(testAccuracy))

# print('Learned regression forest model:')
# print(model.toDebugString())

# # Save and load model
Пример #33
0
def main():
    text = sc.textFile(inputs)

    nltk_data_path = "[change to your own nltk_data location]"  # may be changed to the SFU server path
    nltk.data.path.append(nltk_data_path)
    cleaned_review = text.map(clean_reviewf).cache()

    reviews_txt = cleaned_review.map(lambda review: review['reviewText'])
    reviews = cleaned_review.map(lambda review: (review['overall'], review['reviewText'], review['reviewTime'])).cache()
    training_reviews = reviews.filter(lambda (rating, review_text, review_date): review_date.tm_year < 2014)
    testing_reviews = reviews.filter(lambda (rating, review_text, review_date): review_date.tm_year == 2014)
    training_data = training_reviews.map(lambda (rating, review_text, review_date): (rating, review_text)).zipWithIndex().cache()
    testing_data = testing_reviews.map(lambda (rating, review_text, review_date): (rating, review_text)).zipWithIndex().cache()

    training_rating = training_data.map(lambda ((rating, review_text), review_index): (review_index, rating))
    training_review_text = training_data.map(lambda ((rating, review_text), review_index): (review_index, review_text))
    training_review_text_flat = training_review_text.flatMapValues(myf)
    training_review_text_flat = training_review_text_flat.map(lambda (review_index, review_word): (review_word, review_index))

    testing_rating = testing_data.map(lambda ((rating, review_text), review_index): (review_index, rating))
    testing_review_text = testing_data.map(lambda ((rating, review_text), review_index): (review_index, review_text))
    testing_review_text_flat = testing_review_text.flatMapValues(myf)
    testing_review_text_flat = testing_review_text_flat.map(lambda (review_index, review_word): (review_word, review_index))

    word2vec_model = generate_word2vec_model(reviews_txt)
    mv = word2vec_model.getVectors()
    # getVectors() returns a Java-backed map; copy its values into plain Python lists before parallelizing
    mvdct = []
    for k,v in mv.items():
        vec = [f for f in v]
        mvdct.append((k,vec))
    dct_rdd = sc.parallelize(mvdct)
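    # Hedged sketch (not in the original snippet): generate_word2vec_model is
    # defined elsewhere. With pyspark.mllib.feature.Word2Vec it could look
    # roughly like this; vector size and seed are illustrative choices.
    def generate_word2vec_model_sketch(review_texts):
        from pyspark.mllib.feature import Word2Vec
        # Word2Vec.fit expects an RDD of token lists, one list per document
        tokenized = review_texts.map(lambda review: review.split())
        return Word2Vec().setVectorSize(100).setSeed(42).fit(tokenized)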

    training_feature_vecs = dct_rdd.join(training_review_text_flat)
    training_vecs = training_feature_vecs.map(lambda (w,(feature_vec, review_index)): (review_index, (feature_vec, 1)))
    training_reduce_vecs = training_vecs.reduceByKey(lambda v1,v2: (np.sum([v1[0],v2[0]], axis=0),v1[1]+v2[1]))
    training_avg_vecs = training_reduce_vecs.map(lambda (review_index, (feature_vec, ct)): (review_index, np.array(feature_vec)/float(ct)))
    training_rating_avgf = training_rating.join(training_avg_vecs)
    training_lps = training_rating_avgf.map(get_lp)

    testing_feature_vecs = dct_rdd.join(testing_review_text_flat)
    testing_vecs = testing_feature_vecs.map(lambda (w,(feature_vec, review_index)): (review_index, (feature_vec, 1)))
    testing_reduce_vecs = testing_vecs.reduceByKey(lambda v1,v2: (np.sum([v1[0],v2[0]], axis=0),v1[1]+v2[1]))
    testing_avg_vecs = testing_reduce_vecs.map(lambda (review_index, (feature_vec, ct)): (review_index, np.array(feature_vec)/float(ct)))
    testing_rating_avgf = testing_rating.join(testing_avg_vecs)
    testing_lps = testing_rating_avgf.map(get_lp)

    trees_nums = range(2,10)
    results = []
    for trees_num in trees_nums:
        rf_model = RandomForest.trainRegressor(training_lps, categoricalFeaturesInfo={},
                                        numTrees=trees_num, featureSubsetStrategy="auto",
                                        impurity='variance', maxDepth=10, maxBins=32)
        predictions = rf_model.predict(testing_lps.map(lambda x: x.features))
        labelsAndPredictions = testing_lps.map(lambda lp: lp.label).zip(predictions)
        MSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /float(testing_lps.count())
        RMSE = math.sqrt(MSE)
        result = 'tree nums: ' + str(trees_num) + ', RMSE: ' + str(RMSE)
        results.append(result)

    outdata = sc.parallelize(results)
    outdata.saveAsTextFile(output)
Пример #34
0
        "Root Mean Squared Error (RMSE) for Decision Tree on test data = %g" %
        rmse)
    r2_dt = decisionTree_model_evaluator = RegressionEvaluator(
        labelCol="MPG", predictionCol="prediction", metricName="r2")
    print("R Squared (R2) for Decision Tree on test data = %g" %
          r2_dt.evaluate(decisionTree_model_predictions))

    ############################---RANDOM FOREST REGRESSION---##################################

    train_rdd_rf = train_df.rdd.map(
        lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))
    test_rdd_rf = test_df.rdd.map(
        lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

    RandomForest_model = RandomForest.trainRegressor(
        train_rdd_rf,
        categoricalFeaturesInfo={},
        numTrees=50,
        featureSubsetStrategy="auto",
        maxDepth=10,
        maxBins=100)

    predictions = RandomForest_model.predict(
        test_rdd_rf.map(lambda x: x.features))
    labelsAndPredictions = test_rdd_rf.map(lambda lp: lp.label).zip(
        predictions)
    metrics = RegressionMetrics(labelsAndPredictions)
    print("RMSE of randomForest on Test data = %s" %
          metrics.rootMeanSquaredError)
    print("R-squared of randomForest on Test data = %s" % metrics.r2)
Пример #35
0
                            .join( avgTemperature ) \
                            .map( lambda row: [ item for sublist in row for item in sublist ] ) \
                            .map( lambda row: LabeledPoint( row[ 2 ][ 1 ], [ row[ 2 ][ 0 ], row[ 1 ], row[ 3 ] ] ) ) \
                            .cache( );
 
 crimeCounts.unpersist( );
 
 # Split the crime counts into training and test datasets
 ( training, test ) = joinedData.randomSplit( ( 0.7, 0.3 ) );
 
 # Categorical features dictionary
 featuresInfo = { 0: len( beatsDict ), 1: 53 };
 
 # Train a Random Forest model to predict crimes
 model = RandomForest.trainRegressor( training, categoricalFeaturesInfo = featuresInfo,
                                      numTrees = 5, featureSubsetStrategy = "auto",
                                      impurity = 'variance', maxDepth = 10, maxBins = len( beatsDict ) );
 
 # Measure the model performance on test dataset
 predictions = model.predict( test.map( lambda x: x.features ) ) \
                    .cache( );
 
 meanCrimes = test.map( lambda x: x.label ).mean( );
 labelsAndPredictions = test.map( lambda x:  x.label ).zip( predictions );
 testMSE = labelsAndPredictions.map( lambda ( v, p ): ( v - p ) * ( v - p ) ).sum( ) / float( test.count( ) );
 testSSE = labelsAndPredictions.map( lambda ( v, p ): ( v - p ) * ( v - p ) ).sum( );
 testSST = labelsAndPredictions.map( lambda ( v, p ): ( v - meanCrimes ) * ( v - meanCrimes ) ).sum( );
 
 Rsq = 1 - testSSE / testSST;
 
 #### Predicting crimes for next week ####
Пример #36
0
############# ############# ############# ############# #############
Пример #37
0
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, Tokenizer, HashingTF, VectorIndexer

#feature_cols = ["int_"+x for x in char_col_toUse_names] + num_col_toUse_names

string_indexers = [
   StringIndexer(inputCol=x, outputCol="int_{0}".format(x))
   for x in char_col_toUse_names
]

assembler = VectorAssembler(
    inputCols= ["int_"+x for x in char_col_toUse_names] + num_col_toUse_names,
    outputCol="features"
)

pipeline = Pipeline(stages=string_indexers + [assembler])
model = pipeline.fit(taxi_df)
indexed = model.transform(taxi_df)
ml_df = indexed.select(col("Tool Days").cast("int").alias("label"), col("features")).map(lambda row: LabeledPoint(row.label, row.features))

training, test = ml_df.randomSplit([0.8, 0.2], seed=0)

rfm = RandomForest.trainRegressor(sc.parallelize(training.collect()), categoricalFeaturesInfo={0:24,1:3,2:4,3:5,4:107}, numTrees=10, featureSubsetStrategy="auto", impurity='variance', maxDepth=5, maxBins=120)

predictions = rfm.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda x: x.label).zip(predictions)

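# Note: each zipped pair below is (actual label, prediction), so this measures
# the mean relative error taken with respect to the predicted value.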
error = 1.0 * labelsAndPredictions.filter(lambda (p, a): a!=0).map(lambda (p, a): abs(p-a)/a).reduce(lambda a, b: a+b) / test.count()
error
    #We have to turn it into a list of observations
    visual_training_data = []
    for i in range(0, len(visual_training_outcome_array)):
        visual_training_data.append(
            (visual_training_outcome_array[i], visual_training_image_array[i]))
    visual_training_rdd = sc.parallelize(visual_training_data)
    visual_data_flattened = visual_training_rdd.map(
        lambda x: (x[0], averageBrightness4By4(x[1])))
    visual_data_labeled_points = visual_data_flattened.map(
        lambda x: varsToLabeledPoint(x))
    toprint = visual_data_labeled_points.take(1)
    print(str(toprint))
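    # Hedged sketches (not part of the original snippet) of the helpers used in
    # this example: averageBrightness4By4, varsToLabeledPoint and, further
    # below, featuresToVectors. The real definitions live elsewhere; these
    # assume each image arrives as a 2D numpy array of pixel brightness values.
    def averageBrightness4By4_sketch(image):
        # average brightness over a 4x4 grid of blocks -> 16 features
        h, w = image.shape
        return [image[i * h // 4:(i + 1) * h // 4,
                      j * w // 4:(j + 1) * w // 4].mean()
                for i in range(4) for j in range(4)]

    def varsToLabeledPoint_sketch(outcome_and_features):
        from pyspark.mllib.regression import LabeledPoint
        outcome, features = outcome_and_features
        return LabeledPoint(outcome, features)

    def featuresToVectors_sketch(features):
        from pyspark.mllib.linalg import Vectors
        return Vectors.dense(features)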
    visual_model = RandomForest.trainRegressor(visual_data_labeled_points,
                                               categoricalFeaturesInfo={},
                                               numTrees=1000,
                                               featureSubsetStrategy="auto",
                                               impurity='variance',
                                               maxDepth=5,
                                               maxBins=100)
    #visual_model = LinearRegressionWithSGD.train(visual_data_labeled_points, iterations=3,intercept=True)

    visual_training_vectors = visual_data_flattened.map(
        lambda x: featuresToVectors(x[1]))
    toprint = visual_training_vectors.take(1)
    print(str(toprint))
    visual_in_sample_predictions = visual_model.predict(
        visual_training_vectors)
    visual_in_sample_labels_and_predictions = visual_data_labeled_points.map(
        lambda lp: lp.label).zip(visual_in_sample_predictions)
    visual_in_sample_labels_and_predictions.foreach(printline)
    squaresdf = visual_in_sample_labels_and_predictions.map(lambda p: (p[0], p[
Пример #39
0
import sys

from pyspark.context import SparkContext
from pyspark.mllib.util import MLUtils
from pyspark.mllib.tree import RandomForest, RandomForestModel

sc = SparkContext('yarn', 'weather_predictor')

data = MLUtils.loadLibSVMFile(sc,
                              'hdfs:///users/wfvining/'+sys.argv[1])

(train, test) = data.randomSplit([0.7, 0.3])

model = RandomForest.trainRegressor(train,
                                    categoricalFeaturesInfo={x: 2
                                                             for x
                                                             in range(615, 654)},
                                    numTrees=10, featureSubsetStrategy='auto',
                                    maxDepth=5)
predictions = model.predict(test.map(lambda x:x.features))
labelsAndPredictions = test.map(lambda lp:lp.label).zip(predictions)
testErr = labelsAndPredictions.map(
    lambda (v, p): (v - p) * (v - p)).sum() / float(test.count())
print('Mean Squared Error: ' + str(testErr))

Пример #40
0
def cross_validate(df_clean, games, n):
    """
    :param k n: number of users for CV
    :return: list of accuracies for each of n users, avg acc
    """
    missing = set()

    ### COLLECTING LIBRARIES ###
    gamesOwned = get_gamesOwned('/media/sf_AdvancedML/Final/userData.txt')
    print "Done collecting ownedGames."

    ### VALIDATING ###
    all_accur = {'model1': [], 'model2': [], 'model3': [], 'model4': []}

    for i in range(n):
        # initialize empty frame with appropriate columns
        df = pd.DataFrame(columns = list(df_clean.columns.values)+['playtime'])

        # Randomly select user's library
        user = random.choice(gamesOwned.values())

        # Connect playtime to game df for games owned
        if len(user.values()) > 0:
            #print user.values()[0]
            for k, v in user.values()[0].iteritems():
                if k in games:
                    row = df_clean.loc[df_clean['name'] == k]
                    row['playtime'] = np.log(v)
                    df = df.append(row)
                else:
                    missing.add(k)

        df = df.drop('genres', 1)
        df = df.drop('name', 1)
        df = df.drop('appID', 1)
        df = df.drop('cat', 1)
        df = df.drop('tags', 1)
        df = df.drop('type', 1)

        # Pass User DF to Spark
        df.to_csv('/media/sf_AdvancedML/Final/RF_train.csv')

        data = sc.textFile('/media/sf_AdvancedML/Final/RF_train.csv')
        header = data.first()
        data = data.filter(lambda x: x != header)
        data = data.map(lambda line: convertUni(line))
        data = data.map(lambda line: line.split(','))

        # RDD of (label, features) pairs
        data = data.map(lambda line: LabeledPoint(line[-1], line[:-1]))

        # Split into training, test
        (trainingData, testData) = data.randomSplit([0.8, 0.2])

        try:
            # Training model
            model1 = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo = {},
                                                numTrees = 70, featureSubsetStrategy = "auto",
                                                impurity = 'variance', maxDepth = 4)
            model2 = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo = {},
                                                numTrees = 100, featureSubsetStrategy = "auto",
                                                impurity = 'variance', maxDepth = 4)
            model3 = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo = {},
                                                numTrees = 120, featureSubsetStrategy = "auto",
                                                impurity = 'variance', maxDepth = 4)
            model4 = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo = {},
                                                numTrees = 100, featureSubsetStrategy = "auto",
                                                impurity = 'variance', maxDepth = 6)

            models = [model1, model2, model3, model4]
            modelNames = ['model1', 'model2', 'model3', 'model4']
            for i in range(len(models)):
                m = models[i]
                name = modelNames[i]
                # Evaluate on test data, compute error
                predictions = m.predict(testData.map(lambda x: x.features))
                labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
                testMSE = labelsAndPredictions.map(lambda (v, p) : (v-p)*(v-p)).sum() /\
                          float(testData.count())

                all_accur[name] += [testMSE]

        except:
            pass

    avgDict = {}
    for k,v in all_accur.iteritems():
        avgDict[k] = np.mean(v)
    return all_accur, avgDict
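
# Illustrative usage (not in the original): evaluate the four candidate models
# over, say, ten randomly sampled user libraries.
# all_accur, avgDict = cross_validate(df_clean, games, n=10)
# print avgDict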
sc = spark.sparkContext

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'data/diamonds_price.data')

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=123)

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainRegressor(trainingData,
                                    categoricalFeaturesInfo={},
                                    numTrees=25,
                                    featureSubsetStrategy="auto",
                                    impurity='variance',
                                    maxDepth=20,
                                    maxBins=32,
                                    seed=123)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

testRMSE = math.sqrt(
    labelsAndPredictions.map(lambda lp: (lp[0] - lp[1])**2).sum() /
    float(testData.count()))

result = testData.zip(predictions).collect()

# Print the predictions to output file
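# A minimal sketch of that step (the original stops here): 'result' holds
# (LabeledPoint, prediction) pairs, and the output path is illustrative.
with open('diamond_price_predictions.csv', 'w') as out_file:
    for lp, pred in result:
        out_file.write('%f,%f\n' % (lp.label, pred))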
Пример #42
0
    inputCols=["tf_idf", "gilded", "distinguished", "controversiality"],
    outputCol="features")
mergedDF = assembler.transform(mergedDF)
mergedDF.show()
scoreFeaturesPair = mergedDF.map(lambda x: (x[7], x[0])).repartition(500)
features = scoreFeaturesPair.map(lambda x: x[0])
scores = scoreFeaturesPair.map(lambda x: int(x[1]))

zipped_data = (
    scores.zip(features).map(lambda x: LabeledPoint(x[0], x[1])).cache())

# Do a random split so we can test our model on non-trained data
training, test = zipped_data.randomSplit([0.7, 0.3])

# Train our model
model = RandomForest.trainRegressor(training, {1048577: 4, 1048578: 2}, 10)
#model = LinearRegressionWithSGD.train(training)

# Use our model to predict
train_preds = (training.map(lambda x: x.label).zip(
    model.predict(training.map(lambda x: x.features))))
test_preds = (test.map(lambda x: x.label).zip(
    model.predict(test.map(lambda x: x.features))))

# Ask PySpark for some metrics on how our model predictions performed
trained_metrics = RegressionMetrics(
    train_preds.map(lambda x: (float(x[1]), x[0])))
test_metrics = RegressionMetrics(test_preds.map(lambda x: (float(x[1]), x[0])))

with open('reSampleResult2.txt', 'w+') as f:
    f.write(str(trained_metrics.explainedVariance) + '\n')
Пример #43
0
filtered_car_data = car_data.map(
    lambda d: [toInteger(d["prc"]), toAge(d["fr"]), toFuel(d["fl"]), toInteger(d["ma"]), d["pk"], d["po"], d["ei"]]
)
filtered_car_data.first()
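# Hedged sketches (not in the original) of the conversion helpers used above;
# the field formats and category codes are assumptions.
def toInteger_sketch(value):
    # strip currency symbols, separators and other non-digit characters
    digits = ''.join(ch for ch in str(value) if ch.isdigit())
    return int(digits) if digits else 0

def toAge_sketch(first_registration):
    # age in years, assuming the field holds a registration year
    # (the reference year is illustrative)
    return 2016 - int(first_registration)

def toFuel_sketch(fuel):
    # encode the fuel type as a numeric feature (assumed categories)
    return {'benzine': 0, 'diesel': 1}.get(str(fuel).lower(), 2)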

labeled_car_data = filtered_car_data.map(lambda row: LabeledPoint(row[0], row[1:]))
labeled_car_data.first()
labeled_car_data.collect()


"""
	(3) Run the Random Forest.

"""
model = RandomForest.trainRegressor(
    labeled_car_data, numTrees=750, categoricalFeaturesInfo={}, impurity="variance", maxDepth=5, maxBins=32
)

predictions = model.predict(labeled_car_data.map(lambda x: x.features))
labelsAndPredictions = labeled_car_data.map(lambda lp: [lp.label, lp.features]).zip(predictions)
labelsAndPredictions.first()

model_error = labelsAndPredictions.map(lambda row: (row[1] - row[0][0], row))


"""
	(4) Get the extremes!

		Best & Worst deal.

Пример #44
0
features=rdd.map(lambda t: (t[0],t[1],t[2],t[5],t[6],t[9],t[10],t[11],t[12],t[15],t[16]))
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)                              

#select value we want to predict
#lab = rdd.map(lambda row: row[8])#time
lab = rdd.map(lambda row: row[7])#fare
transformedData = lab.zip(features_transform)
transformedData = transformedData.map(lambda row: LabeledPoint(row[0],[row[1]]))

#split into training and testing datasets
trainingData, testingData = transformedData.randomSplit([0.9,0.1],seed=1234)

#do the training and get predictions
model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},impurity='variance',numTrees=25, seed=42, maxDepth=8)
predictions = model.predict(testingData.map(lambda x: x.features))
valuesAndPreds = testingData.map(lambda lp: lp.label).zip(predictions)
results = valuesAndPreds.toDF().toPandas()
results.columns = ['truth', 'pred']
results = results[results['truth'] > 0]
truth = np.array(results["truth"].tolist())
pred = np.array(results["pred"].tolist())
diff_fare = 100*(truth - pred)/truth

print 'mean = ' + str(diff_fare.mean())

#R-squared
metrics = RegressionMetrics(valuesAndPreds)
print("R-squared = %s" % metrics.r2)
Пример #45
0
from __future__ import print_function

from pyspark import SparkContext
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

if __name__ == "__main__":
    sc = SparkContext(appName="PythonRandomForestRegressionExample")
    # Load and parse the data file into an RDD of LabeledPoint.
    # data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/newborn2013.txt')
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={0:3,1:4,2:2},
                                        numTrees=4, featureSubsetStrategy="auto",
                                        impurity='variance', maxDepth=4, maxBins=12)

    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression forest model:')
    print(model.toDebugString())

    model.save(sc, "target/tmp/myRandomForestRegressionModel")
    sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestRegressionModel")
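    # Illustrative check (not in the original): the reloaded model can be used
    # exactly like the freshly trained one.
    samePredictions = sameModel.predict(testData.map(lambda x: x.features))
    print('Predictions from reloaded model: ' + str(samePredictions.take(5)))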
# Dictionary mapping each beat to an index. Useful when converting to LabeledPoint. Otherwise converts to numeric.
beatsDict = dict(beatList.zipWithIndex().collect())

# Data points as LabeledPoints
# (crime count, [beat, week])
predArrayLP = joinedData.map(lambda x: LabeledPoint(x[0], [weekDict[x[1][0]], beatsDict[x[1][1]], x[1][2]]))

# Split into training and testing set. 70-30 split.
(train, test) = predArrayLP.randomSplit([0.7, 0.3])

# Feature categories : 
featuresCat = {0: len(beatsDict), 1: 53}
maxBins = max(len(beatsDict),len(weekDict))

model = RandomForest.trainRegressor(train, categoricalFeaturesInfo=featuresCat,
                                    numTrees=10, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=5, maxBins=maxBins)


# Evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
#rschoolCountBeats = schoolCount.map(lambda x: x[0])
predOutput = predictions.collect()
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(test.count())
print('Test Mean Squared Error = ' + str(testMSE))

### Write output to file ###
with open("predictions.txt", 'wb') as f:
    writer = csv.writer(f)
    writer.writerows([[p] for p in predOutput])  # one prediction per row
Пример #47
0
def run(jobNm,
        sc,
        sqlContext,
        inputFile,
        lPolygon,
        dictFile,
        nDataType=0,
        inputPartitions=-1,
        sNum=30,
        modelSavePath=None,
        bWriteMonitor=False,
        writeFileOutput=True,
        strStop=''):

    if bWriteMonitor:
        import plotting

    bc_lTargetPolygons = sc.broadcast(lPolygon)
    stopSet = set(strStop.split(',')) if strStop != '' else set()

    #Create monitoring plot and associated vectors
    mPX = range(7)
    mPY = [0.] * 7
    mSL = [
        "Initial Read", "Calculate IDF", "Partition for M.L.",
        "Create Training Vector", "Train Model", "Apply Model",
        "Prepare Output Data"
    ]
    mInd = 0

    t0 = time.time()
    #Read in data and filter out entries with no valid words
    t1 = time.time()
    print 'inputFile ', inputFile
    print 'inputPartitions ', inputPartitions
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile,
                                             inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good tweets:", nGoodTweets
    diff = t2 - t1
    print "Time to read in and filter nonscorable words", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Find the word document frequency for the corpus
    #this is used for an idf score used in feature vector formation
    t1 = time.time()
    revLookup = []
    lStop = []
    fDict = None
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile, "r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])

    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to read dict:", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Split data into training and apply samples
    # training data is 2 parts, inside r.o.i., and a sample of the areas outside the r.o.i.
    t1 = time.time()
    sqlContext.registerFunction(
        "inRegionOfInterest",
        lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons),
        returnType=BooleanType())
    df1 = sqlContext.sql(
        "SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)"
    ).cache()
    df1.registerTempTable("df1")
    nIn = df1.count()
    dfn1 = sqlContext.sql(
        "SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)"
    ).cache()
    dfn1.registerTempTable("dfn1")
    nOut = dfn1.count()
    modelDict = aggregatedComparison.exemplarDict(df1, revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to find in and out of ROI", diff
    print "N in:", nIn, ", N out:", nOut
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Create training vectors from in region data, and sample of out region data
    t1 = time.time()
    #grouped = aggregatedComparison.createAggregatedLabledPoint(df1, False, fBinSize, bc_dIDF, True, bc_lStopWords, nGoodTweets, 1.0)
    #grouped2 = aggregatedComparison.createAggregatedLabledPoint(dfn1, False, fBinSize, bc_dIDF, True, bc_lStopWords, nGoodTweets, -1.0)
    #nSignal = float(grouped.count())
    #nBack = float(grouped2.count())
    groupedIn = df1.map(lambda x: (x.key, [
        LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize
    ])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [
        LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize
    ])).cache()
    scaleFactor = (10. * nIn) / float(nOut)
    (mlApply,
     groupedUse) = groupedOut.randomSplit([1 - scaleFactor, scaleFactor])
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(
            lambda x: aggregatedComparison.removeStopWords(x, lStop))
    nTotTrain = mlTrain.count()
    mlApply.cache()
    nApply = mlApply.count()
    t2 = time.time()
    print nTotTrain, "entries for training"
    diff = t2 - t1
    print "Time to get data ready for model by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # train model
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(mlTrain.map(lambda x: x[1][0]),
                                             categoricalFeaturesInfo={},
                                             numTrees=100,
                                             featureSubsetStrategy="auto",
                                             impurity="variance",
                                             maxDepth=4,
                                             maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/": modelSavePath = modelSavePath + "/"
        model_Tree.save(sc, modelSavePath + jobNm)
    t2 = time.time()
    diff = t2 - t1
    print "Time to train model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # apply model
    t1 = time.time()
    predictions_Tree = model_Tree.predict(
        mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    diff = t2 - t1
    print "Time to apply model: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(False, jobNm,
                                                 vecAndPredictions, sNum,
                                                 revLookup, writeFileOutput,
                                                 modelDict)
    t2 = time.time()
    diff = t2 - t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:,", nGoodTweets
    print "< total process Time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet
Пример #48
0
############# ############# ############# ############# #############
def randomForestRegression(trainingData, testData, trainingSize, testSize):
    '''
    Random forest regression with a grid search over maxDepth, maxBins, and numTrees.
    '''
    # parameter range
    maxDepthValList = [30]
    maxBinsValList = [16, 24, 32]
    numTreesValList = [10, 20]

    # best parameters
    bestMaxDepthVal = 10
    bestMaxBinsVal = 16
    bestNumTreesVal = 10
    bestTrainingRMSE = 1e10

    for maxDepthVal, maxBinsVal, numTreesVal in itertools.product(
            maxDepthValList, maxBinsValList, numTreesValList):
        model = RandomForest.trainRegressor(trainingData,
                                            categoricalFeaturesInfo={},
                                            numTrees=numTreesVal,
                                            featureSubsetStrategy="auto",
                                            impurity='variance',
                                            maxDepth=maxDepthVal,
                                            maxBins=maxBinsVal)
        predictions = model.predict(trainingData.map(lambda x: x.features))
        ValsAndPreds = trainingData.map(lambda x: x.label).zip(predictions)
        trainingRMSE = math.sqrt(
            ValsAndPreds.map(lambda (v, p):
                             (v - p)**2).reduce(lambda x, y: x + y) /
            trainingSize)
        if trainingRMSE:
            if trainingRMSE < bestTrainingRMSE:
                bestMaxDepthVal = maxDepthVal
                bestMaxBinsVal = maxBinsVal
                bestNumTreesVal = numTreesVal
                bestTrainingRMSE = trainingRMSE
        print maxDepthVal, maxBinsVal, numTreesVal, trainingRMSE
    print bestMaxDepthVal, bestMaxBinsVal, bestNumTreesVal, bestTrainingRMSE

    model = RandomForest.trainRegressor(trainingData,
                                        categoricalFeaturesInfo={},
                                        numTrees=bestNumTreesVal,
                                        featureSubsetStrategy="auto",
                                        impurity='variance',
                                        maxDepth=bestMaxDepthVal,
                                        maxBins=bestMaxBinsVal)

    # evaluating the model on training data
    predictions = model.predict(trainingData.map(lambda x: x.features))
    ValsAndPreds = trainingData.map(lambda x: x.label).zip(predictions)
    trainingRMSE = math.sqrt(
        ValsAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y)
        / trainingSize)
    print trainingRMSE

    # evaluating the model on test data
    predictions = model.predict(testData.map(lambda x: x.features))
    ValsAndPreds = testData.map(lambda x: x.label).zip(predictions)
    testRMSE = math.sqrt(
        ValsAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y)
        / testSize)
    print testRMSE
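
    # Note (not in the original): selecting parameters by training RMSE tends
    # to favour the deepest, most complex model; a held-out split is less prone
    # to overfitting, e.g.:
    #   (subTrain, subValid) = trainingData.randomSplit([0.8, 0.2])
    #   ... fit each candidate on subTrain, compare by RMSE on subValid ...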
Пример #50
0
def run(jobNm,sc,sqlContext,inputFile,lPolygon,dictFile,
        nDataType=0,
        inputPartitions=-1,
        sNum=30,
        modelSavePath=None,
        bWriteMonitor=False,
        writeFileOutput=True,
        strStop=''):

    if bWriteMonitor:
        import plotting

    bc_lTargetPolygons = sc.broadcast(lPolygon)
    stopSet = set(strStop.split(',')) if strStop !='' else set()

    #Create monitoring plot and associated vectors
    mPX = range(7)
    mPY = [0.]*7
    mSL = ["Initial Read", "Calculate IDF", "Partition for M.L.", "Create Training Vector", "Train Model", "Apply Model", "Prepare Output Data"]
    mInd = 0

    t0 = time.time()
    #Read in data and filter out entries with no valid words
    t1 = time.time()
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile, inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good points:", nGoodTweets
    diff = t2-t1
    print "Time to read in and filter nonscorable words", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)


    #Find the word document frequency for the corpus
    #this is used for an idf score used in feature vector formation
    t1 = time.time()
    revLookup = []
    lStop = []
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile,"r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])

    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2-t1
    print "Time to read dict: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Split data into training and apply samples
    # training data is 2 parts, as well as prepare application data
    # i.)  In both the region, and in the time window
    # ii.) In the region, but outside the time window
    # iii.) Out of region, data to apply model to
    t1 = time.time()
    sqlContext.registerFunction("inRegionOfInterest", lambda lat,lon: fspLib.inROI(lat,lon,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("inEventOfInterest", lambda lat,lon,date: fspLib.inEOI(lat,lon,date,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("outOfEventOfInterest", lambda lat,lon,dt: fspLib.outEOI(lat,lon,dt,bc_lTargetPolygons),returnType=BooleanType())
    df1 = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)").cache()
    df1.registerTempTable("df1")
    df1_inTime = sqlContext.sql("SELECT * from df1 WHERE inEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    #df1_outTime = sqlContext.sql("SELECT * from df1 WHERE outOfEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    dfn1 =  sqlContext.sql("SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)")
    df1_inTime.registerTempTable("df1_inTime")
    #df1_outTime.registerTempTable("df1_outTime")
    #nL1T1 = df1_inTime.count()
    #nL1T0 = df1_outTime.count()
    exempDict = aggregatedComparison.exemplarDict(df1_inTime, revLookup)
    t2 = time.time()
    #print nL1T1, "events in region in time,", nL1T0, "events in region out of time"
    diff = t2-t1
    print "Time to partition by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Create training vectors from in region data
    t1 = time.time()
    groupedIn  = df1_inTime.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    #groupedOut = df1_outTime.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize, x.dt])).cache()
    nSignal = float(groupedIn.count())
    nBack = float(groupedOut.count())
    scaleFactor = 10.*nSignal/nBack
    (mlApply, groupedUse) = groupedOut.randomSplit([1-scaleFactor,scaleFactor])
    mlApply.cache()
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(lambda x: aggregatedComparison.removeStopWords(x, lStop))
    mlTrain.cache()
    nTotTrain = mlTrain.count()
    t2 = time.time()
    print nTotTrain, "entries for training"
    diff = t2-t1
    print "Time to get data ready for model by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # train model
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(mlTrain.map(lambda x: x[1][0]), categoricalFeaturesInfo={}, numTrees=2000, featureSubsetStrategy="auto", impurity="variance", maxDepth=4, maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/": modelSavePath = modelSavePath+"/"
        model_Tree.save(sc, modelSavePath + jobNm)
    t2 = time.time()
    diff = t2-t1
    print "Time to train model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Apply Model to out of region data
    t1 = time.time()
    predictions_Tree = model_Tree.predict(mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    #print "Number of points to score:", nApply
    diff = t2-t1
    print "Time aggregate and label points: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(True, jobNm, vecAndPredictions, sNum, revLookup, writeFileOutput, exempDict)
    t2 = time.time()
    diff = t2-t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:,", nGoodTweets
    print "< total process Time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet
Пример #51
0
joinedData = allCrimeCounts.map(
    lambda row: ((row[0][1]), (row[0][0], row[1]))).join(temperature).map(
        lambda row: ((row[0].weekday(), row[1][0][0], row[1][1]), row[1][0][1])
    ).reduceByKey(lambda x, y: x + y).map(
        lambda row: LabeledPoint(row[1], [row[0][0], row[0][1], row[0][2]]))
print joinedData.top(2)

# Split the crime counts into training and test datasets
(training, test) = joinedData.randomSplit((0.9, 0.1))

# Train a Random Forest model to predict crimes
model = RandomForest.trainRegressor(training,
                                    categoricalFeaturesInfo={
                                        0: 7,
                                        1: len(PCTsDict)
                                    },
                                    numTrees=7,
                                    featureSubsetStrategy="auto",
                                    impurity='variance',
                                    maxDepth=10,
                                    maxBins=len(PCTsDict))

#### Predicting crimes for a day####
PCTsDictInverse = dict((v, k) for k, v in PCTsDict.items())

data = []
for weekday in range(7):
    for tempForecast in range(10, 100, 5):
        # Test dataset for each beat with next week's info
        predictday = sc.parallelize(
            tuple([(weekday, PCT, tempForecast)
                   for PCT in range(len(PCTsDict))]))
Пример #52
0
from __future__ import print_function

# $example on$
from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils


if __name__ == "__main__":
    sc = SparkContext(appName="PythonRandomForestRegExample")
    data = MLUtils.loadLibSVMFile(sc,"file:///home/yl408/yuhao_datasets/phishing")
    #data = spark.read.format("libsvm").load("file:///home/yl408/yuhao_datasets/rcv1_train.binary")
    model = RandomForest.trainRegressor(data, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='variance', maxDepth=4, maxBins=32)
    model.save(sc, "file:///home/yl408/spark-ml/myrandomForestModel")
Пример #53
0
# Get file paths from arguments
if len(sys.argv) != 4:
  print "Usage: random_forest.py FEATURES_FILE MODEL_FOLDER DISTRICTS_FILE"
  sys.exit()
features_file, model_folder, districts_file = sys.argv[1:]

spark_context, sql_context = create_spark_application("train_random_forest")
data_loader = DataLoader(spark_context, sql_context, features_file)
# Scaling and one-hot encoding are disabled for the random forest because they mainly benefit linear regression models
data_loader.initialize(do_scaling=False, do_onehot=False)

# For decision trees, keeping the categorical features as-is makes more sense
maxBins = 32
categorical_features_info = data_loader.get_categorical_features_info()
if categorical_features_info and max(categorical_features_info.values()) > maxBins:
  maxBins = max(categorical_features_info.values())

# train and store a model for each district in the districts file
for lat, lon in read_districts_file(districts_file):
  print("Training District: %f, %f" % (lat, lon))
  start = time.time()
  model = RandomForest.trainRegressor(data_loader.get_train_data((lat, lon)),
                                      categoricalFeaturesInfo=categorical_features_info,
                                      numTrees=5,
                                      maxDepth=15,
                                      maxBins=maxBins)
  #save the model in the specified model_folder
  model.save(spark_context,
             '%s/model_%s_%s' % (model_folder, str(lat), str(lon)))
  print("Done training district. Took %f s." % (time.time() - start))