def modelWithLogisticRegression(trainingData, validationData):
    """Pick the best SGD logistic regression over a grid of regularization values.

    One model is trained per candidate regularization parameter (200
    iterations, step size fixed at 1.0) and scored by its accuracy on
    ``validationData``.

    Args:
        trainingData: RDD of labeled points used to fit every candidate model.
        validationData: RDD of labeled points (``.label`` / ``.features``)
            used to score every candidate model.

    Returns:
        A tuple ``(bestLRModel, visualizationData)``. ``bestLRModel`` is the
        model with the highest validation accuracy (``None`` when no model
        scores above 0). ``visualizationData`` is a list of
        ``(regularizer, accuracy)`` pairs in the order they were evaluated.

    Raises:
        ZeroDivisionError: if ``validationData`` is empty.
    """
    regularizationParamater = [.00000001, .0000005, 1., 1000., 100000.]
    bestLRModel = None
    bestAccuracy = 0
    numOfIterations = 200
    visualizationData = []

    # The validation set is the same for every candidate, so count it once
    # instead of launching an extra Spark job per regularizer.
    totalValidationAds = validationData.count()

    for regularizer in regularizationParamater:
        model = LogisticRegressionWithSGD.train(trainingData, numOfIterations, 1.0, regParam=regularizer)
        # The lambda is evaluated by the count() below, i.e. while `model`
        # still refers to this iteration's model.
        predict = validationData.map(lambda ad: (ad.label, model.predict(ad.features)))
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds

        visualizationData.append((regularizer, accuracy))

        if accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestLRModel = model

    return bestLRModel, visualizationData
def main():
    """Train and evaluate a logistic-regression image classifier with Spark.

    Reads comma-separated feature rows from ``pos.csv`` (label 1) and
    ``neg.csv`` (label 0), trains on a random 70% split, prints the error
    rate on the remaining 30% and pickles the model to ``imageModel.pk1``.
    """
    # Presumably this call is what produces pos.csv / neg.csv -- TODO confirm.
    MakePixelFileFromImages("./CarData/TrainImages/*pgm")
    sc = SparkContext(appName="Image Classifier 01")

    # Stop the context even when loading, training or evaluation fails.
    try:
        p = sc.textFile("pos.csv")
        n = sc.textFile("neg.csv")

        pFeatures = p.map(lambda image: image.split(","))
        nFeatures = n.map(lambda image: image.split(","))

        pExamples = pFeatures.map(lambda features: LabeledPoint(1, features))
        nExamples = nFeatures.map(lambda features: LabeledPoint(0, features))

        data = pExamples.union(nExamples)
        (trainingData, testData) = data.randomSplit([0.7, 0.3])

        # The training set is cached because it is reused by the trainer.
        trainingData.cache()

        model = LogisticRegressionWithSGD.train(trainingData)
        labels_and_predictions = testData.map(lambda image: (image.label, model.predict(image.features)))
        # Index into the (label, prediction) pair: tuple parameters in a
        # lambda signature are a syntax error on Python 3 (PEP 3113).
        mismatches = labels_and_predictions.filter(lambda pair: pair[0] != pair[1]).count()
        error_rate = mismatches / float(testData.count())

        print("************* RESULTS *******************")
        print("Error Rate: " + str(error_rate))

        # Close the file deterministically so the pickle is fully flushed.
        with open("imageModel.pk1", "wb") as model_file:
            pickle.dump(model, model_file)
    finally:
        sc.stop()
Exemplo n.º 3
0
    def test_classification(self):
        """Check LR, SVM and naive Bayes predictions on four labeled points.

        The points are built with ``self.scipy_matrix``. Every model is trained
        with default settings and must predict a value <= 0 for the points
        labeled 0.0 and a value > 0 for the points labeled 1.0.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [point.features for point in data]
        # Expected sign of the prediction for features[0..3].
        expect_positive = (False, True, False, True)

        # Train and verify one model at a time: LR, then SVM, then naive Bayes.
        for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
            model = trainer.train(rdd)
            for vector, positive in zip(features, expect_positive):
                prediction = model.predict(vector)
                self.assertTrue(prediction > 0 if positive else prediction <= 0)
Exemplo n.º 4
0
def train_committee(train_features, test_features, size=5):
    """Train a committee of logistic-regression models on resampled data.

    Each attempt trains a model on a with-replacement sample (fraction 1) of
    the training points and keeps it only if its area under the ROC curve on
    the test points exceeds 0.7. Training stops once ``size`` models were
    accepted or ``size * 4`` attempts were made.

    Args:
        train_features: RDD of raw training batches; each element is passed
            through ``process_batch(p, is_train=True)`` and
            ``to_labeled_point``.
        test_features: RDD of raw evaluation batches, processed the same way.
        size: Target number of accepted models.

    Returns:
        List of accepted models (possibly shorter than ``size``), each with
        its threshold cleared.
    """
    committee = []
    attempts = 0
    max_attempts = size * 4
    roc_threshold = 0.7

    test_pairs_features = test_features.map(lambda p: process_batch(p, is_train=True))
    test_labeled_pairs = test_pairs_features.map(to_labeled_point)

    # The training pipeline is identical for every attempt; only the sample
    # drawn from it differs, so define it once outside the loop.
    train_pairs_features = train_features.map(lambda p: process_batch(p, is_train=True))
    train_labeled_pairs = train_pairs_features.map(to_labeled_point)

    while len(committee) < size and attempts < max_attempts:
        attempts += 1

        labeled_points = train_labeled_pairs.sample(True, 1)

        model = LogisticRegressionWithSGD.train(labeled_points)
        # Cleared so predict() yields scores for the ROC computation.
        model.clearThreshold()
        scores_and_labels = test_labeled_pairs.map(lambda p: (model.predict(p.features), p.label))

        metrics = BinaryClassificationMetrics(scores_and_labels)
        # Read the metric once; it is needed for both the check and the log.
        area_under_roc = metrics.areaUnderROC
        if area_under_roc > roc_threshold:
            print(attempts, area_under_roc)
            committee.append(model)

    return committee
Exemplo n.º 5
0
    def test_classification(self):
        """Check LR, SVM and naive Bayes predictions on four dense points.

        Every model is trained with default settings and must predict a value
        <= 0 for the points labeled 0.0 and a value > 0 for those labeled 1.0.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [point.features.tolist() for point in data]
        # Expected sign of the prediction for features[0..3].
        expect_positive = (False, True, False, True)

        # Train and verify one model at a time: LR, then SVM, then naive Bayes.
        for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
            model = trainer.train(rdd)
            for vector, positive in zip(features, expect_positive):
                prediction = model.predict(vector)
                self.assertTrue(prediction > 0 if positive else prediction <= 0)
Exemplo n.º 6
0
def logistic_l2_accuracy(x_train, x_test, regParam):
    """Return the test accuracy of an L2-regularized SGD logistic regression.

    Args:
        x_train: RDD of labeled points used for training; it is cached.
        x_test: RDD of labeled points (``.label`` / ``.features``) to score.
        regParam: Regularization parameter passed to the trainer.

    Returns:
        Fraction of ``x_test`` points whose predicted label equals the label.

    Raises:
        ZeroDivisionError: if ``x_test`` is empty.
    """
    # Cache the training data so the iterative trainer does not recompute it.
    xc = x_train.cache()
    model = LogisticRegressionWithSGD.train(xc, regParam=regParam, regType="l2")
    # (label, prediction) for every test point.
    yhat = x_test.map(lambda p: (p.label, model.predict(p.features)))
    # Index into the pair instead of unpacking it in the lambda signature:
    # tuple parameters are a syntax error on Python 3 (PEP 3113).
    return yhat.filter(lambda pair: pair[0] == pair[1]).count() / float(x_test.count())
Exemplo n.º 7
0
def main():
    """
    Driver program for a spam filter using Spark and MLLib.

    Builds hashed term-frequency features for spam and ham emails, trains a
    logistic regression on a random 70% split, prints the error rate on the
    remaining 30% and pickles the model to ``spamFilter.pkl``.
    """

    # Consolidate the individual email files into a single spam file
    # and a single ham file
    makeDataFileFromEmails("data/spam_2/", "data/spam.txt")
    makeDataFileFromEmails("data/easy_ham_2/", "data/ham.txt")

    # Create the Spark Context for parallel processing
    sc = SparkContext(appName="Spam Filter")

    # Stop the context even when loading, training or evaluation fails.
    try:
        # Load the spam and ham data files into RDDs
        spam = sc.textFile("data/spam.txt")
        ham = sc.textFile("data/ham.txt")

        # Create a HashingTF instance to map email text to vectors of 10,000 features.
        tf = HashingTF(numFeatures=10000)

        # Each email is split into words, and each word is mapped to one feature.
        spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
        hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

        # Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
        positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
        negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))

        # Combine positive and negative datasets into one
        data = positiveExamples.union(negativeExamples)

        # Split the data into 70% for training and 30% test data sets
        (trainingData, testData) = data.randomSplit([0.7, 0.3])

        # Cache the training data to optimize the Logistic Regression
        trainingData.cache()

        # Train the model with Logistic Regression using the SGD algorithm.
        model = LogisticRegressionWithSGD.train(trainingData)

        # Create tuples of actual and predicted values
        labels_and_predictions = testData.map(lambda email: (email.label, model.predict(email.features)))

        # Calculate the error rate as number wrong / total number.
        # The pair is indexed rather than unpacked in the lambda signature,
        # which is a syntax error on Python 3 (PEP 3113).
        mismatches = labels_and_predictions.filter(lambda pair: pair[0] != pair[1]).count()
        error_rate = mismatches / float(testData.count())
        print("*********** SPAM FILTER RESULTS **********")
        print("\n")
        print("Error Rate: " + str(error_rate))
        print("\n")

        # Serialize the model for persistence; the file is closed on exit
        # from the with-block so the pickle is fully flushed.
        with open("spamFilter.pkl", "wb") as model_file:
            pickle.dump(model, model_file)
    finally:
        sc.stop()
Exemplo n.º 8
0
    def test_classification(self):
        """Check linear, naive Bayes and tree-based classifiers on four dense points.

        Every model must predict a value <= 0 for the points labeled 0.0 and a
        value > 0 for the points labeled 1.0.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [point.features.tolist() for point in data]

        def check_predictions(model):
            # features[0] and [2] belong to class 0, features[1] and [3] to class 1.
            for vector, positive in zip(features, (False, True, False, True)):
                prediction = model.predict(vector)
                self.assertTrue(prediction > 0 if positive else prediction <= 0)

        # Models that only need the RDD: trained and verified one at a time.
        for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
            check_predictions(trainer.train(rdd))

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories

        check_predictions(DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo))

        check_predictions(RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))

        check_predictions(GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
def getLogisticRegressionModel(Train_Data):
    """Train an SGD logistic regression on ``Train_Data`` with fixed settings.

    The settings are hard-coded: 10 iterations, step size 10, mini-batch
    fraction 0.1, L2 regularization with parameter 1e-6, and an intercept.

    Returns:
        Whatever ``LogisticRegressionWithSGD.train`` returns for this data.
    """
    hyperparameters = {
        'iterations': 10,
        'miniBatchFraction': 0.1,
        'step': 10.,
        'regParam': 1e-6,
        'regType': 'l2',
        'intercept': True,
    }
    return LogisticRegressionWithSGD.train(data=Train_Data, **hyperparameters)
def logisticRegression(trainingRDD, trainingRDDHashed,
                       testRDDHashed, iterations, minibatch, stepsize):
    """Train an SGD logistic regression and report accuracy and F-score.

    Args:
        trainingRDD: RDD passed to ``LogisticRegressionWithSGD.train``.
        trainingRDDHashed: RDD of indexable records; element ``[0]`` is
            paired with the prediction for element ``[1]`` (presumably
            ``(label, features)`` -- confirm against the caller).
        testRDDHashed: RDD with the same record layout, used as test set.
        iterations: Number of SGD iterations.
        minibatch: Mini-batch fraction.
        stepsize: SGD step size.

    Returns:
        ``(AccuracyV, fScoreV, AccuracyT, fScoreT)``: accuracy and F-score on
        ``trainingRDDHashed`` followed by the same on ``testRDDHashed``.
    """
    # Train a logistic regression model (L2 penalty 0.1, with intercept).
    trainedModel = LogisticRegressionWithSGD.train(
        trainingRDD,
        iterations=iterations,
        miniBatchFraction=minibatch,
        regType="l2",
        intercept=True,
        regParam=0.1,
        step=stepsize)

    def countOutcomes(hashedRDD):
        # Pair record[0] with the prediction for record[1], classify the pair
        # with checkState and count the occurrences of each resulting state.
        return hashedRDD.map(
            lambda record: ((record[0], trainedModel.predict(record[1])), 1)
        ).map(
            lambda pairAndOne: (checkState(pairAndOne[0]), pairAndOne[1])
        ).reduceByKey(add).collectAsMap()

    # Evaluate on the (hashed) training set and on the test set.
    resultsValidation = countOutcomes(trainingRDDHashed)
    resultsTest = countOutcomes(testRDDHashed)
    # Get Counts
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()
    # States that never occurred must read as 0 rather than raise KeyError.
    resultsValidation = defaultdict(lambda: 0, resultsValidation)
    resultsTest = defaultdict(lambda: 0, resultsTest)
    # Get F-Score and Accuracy Values
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)
    # Print Results. The % formatting is applied to the string inside the
    # call; applying it to print()'s return value fails on Python 3.
    print('   Results for Logistic Regression')
    print('      Training Set: %.3f and F-Score: %.3f' % (AccuracyV, fScoreV))
    print('      Test Set: %.3f and F-Score: %.3f' % (AccuracyT, fScoreT))
    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
    def train_trend_model(self, model, data, i):
        """Train the trend classifier selected by ``self.trend_prediction_method``.

        ``data`` is parallelized into an RDD. Random forest and naive Bayes
        are trained from scratch; logistic regression and SVM start from
        ``model.weights`` when a previous ``model`` is given. ``i`` is not
        used. If the configured method matches none of the known constants,
        the incoming ``model`` is returned unchanged.
        """
        self.logger.info('Start to train the direction model')
        rdd_data = self.sc.parallelize(data)
        method = self.trend_prediction_method

        if method == self.RANDOM_FOREST:
            return RandomForest.trainClassifier(
                rdd_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=40,
                featureSubsetStrategy="auto", impurity='gini', maxDepth=20,
                maxBins=32)

        if method == self.NAIVE_BAYES:
            return NaiveBayes.train(rdd_data)

        if method == self.LOGISTIC_REGRESSION:
            # `weights` is read only in this branch, so a previous model of a
            # different kind never has the attribute looked up.
            start_weights = model.weights if model is not None else None
            return LogisticRegressionWithSGD.train(
                rdd_data, iterations=10000, step=0.001, initialWeights=start_weights)

        if method == self.SVM:
            start_weights = model.weights if model is not None else None
            return SVMWithSGD.train(
                rdd_data, iterations=10000, step=0.001, initialWeights=start_weights)

        return model
def main(input_file_path, output_file_path=None):
    """Train a gender classifier and write predictions for unlabeled rows.

    Rows whose 4th comma-separated field is non-empty (excluding the
    ``INDEX`` header row) are used for training; rows whose 4th field is
    empty are predicted and written as ``INDEX,GENDER`` CSV lines.

    Args:
        input_file_path: Path of the CSV file read through ``sc.textFile``.
        output_file_path: Where to write the predictions. Defaults to the
            original hard-coded ``result.csv`` location.
    """
    if output_file_path is None:
        output_file_path = ('/Users/1002720/Documents/workspace/SNU-project/data/'
                            'BDA2Project/1-GenderPrediction/result.csv')

    print('=====>>>>>')
    print('ddd')
    data = sc.textFile(input_file_path)
    traning_data_RDD = data.filter(lambda line: line.split(',')[3] != '' and line.split(',')[0] != 'INDEX')
    unseen_data_RDD = data.filter(lambda line: line.split(',')[3] == '')

    traning_data_pddf = create_pddf(traning_data_RDD)
    traning_data_df = sqlContext.createDataFrame(traning_data_pddf)
    print(traning_data_df.head())

    parsed_data = rdd_to_labeled_point(traning_data_df.rdd)
    parsed_data.persist()
    # Correct print: [LabeledPoint(1.0, [1.0,8.6662186586,6.98047693487])]
    logisticRegressionWithSGD = LogisticRegressionWithSGD.train(parsed_data, iterations=100)

    # Accuracy is measured on the same data the model was trained on.
    labels_and_preds = parsed_data.map(lambda lp: [lp.label, logisticRegressionWithSGD.predict(lp.features)])
    Accuracy = labels_and_preds.filter(lambda ele: int(ele[0]) == int(ele[1])).count() / float(parsed_data.count())
    print("Training Accuracy on training data = " + str(Accuracy))

    unseen_data_pddf = create_pddf(unseen_data_RDD)
    unseen_data_df = sqlContext.createDataFrame(unseen_data_pddf)
    unseen_parsed_data = rdd_to_index_featurs(unseen_data_df.rdd)
    unseen_parsed_data.persist()

    # The with-block closes (and therefore flushes) the result file even if a
    # prediction fails midway.
    with open(output_file_path, 'w', encoding='utf-8') as result_file:
        result_file.write('INDEX,GENDER\n')
        for row in unseen_parsed_data.collect():
            # The prediction is shifted by +1 before being written
            # (presumably GENDER is encoded as 1/2 -- confirm).
            result_file.write(str(row[0]) + ',' + str(logisticRegressionWithSGD.predict(row[1]) + 1) + '\n')

    parsed_data.unpersist()
    unseen_parsed_data.unpersist()
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
Exemplo n.º 13
0
    def test_classification(self):
        """Check LR, SVM, naive Bayes and a decision tree on four labeled points.

        The points are built with ``self.scipy_matrix``. Every model must
        predict a value <= 0 for the points labeled 0.0 and a value > 0 for
        the points labeled 1.0.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [point.features for point in data]

        def check_predictions(model):
            # features[0] and [2] belong to class 0, features[1] and [3] to class 1.
            for vector, positive in zip(features, (False, True, False, True)):
                prediction = model.predict(vector)
                self.assertTrue(prediction > 0 if positive else prediction <= 0)

        # Models that only need the RDD: trained and verified one at a time.
        for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
            check_predictions(trainer.train(rdd))

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        check_predictions(DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo))
Exemplo n.º 14
0
# Build an all-string, nullable schema with one column per comma-separated
# name in `title`.
all_types = []
for i in map(str, title.split(",")):
    all_types.append(StructField(i, StringType(), True))
# Assemble the StructType once, after every field has been collected, rather
# than rebuilding it on each iteration.
schema = StructType(all_types)
from pyspark.sql import Row
from pyspark.mllib.classification import LogisticRegressionWithSGD
from numpy import array
from pyspark.mllib.regression import LabeledPoint

D = 2 ** 24


def helper1(r):
    """Convert one raw row into a LabeledPoint with hashed features.

    ``r[0]`` is treated as a numeric ID, ``r[-1]`` as the numeric target and
    everything in between as values that are hashed (together with their
    column position) into the range ``[0, D)``.

    Args:
        r: Sliceable row of string values.

    Returns:
        ``LabeledPoint(target, features)``, or an all-zero point with 1932
        features and label 0.0 when the row cannot be converted.
    """
    try:
        # NOTE(review): on Python 3, hash() of a str is salted per process
        # unless PYTHONHASHSEED is fixed, so these features may differ between
        # Spark workers and between runs -- confirm the deployment pins it.
        features = [float(abs(hash("VAR_" + str(i) + value))) % D
                    for i, value in enumerate(r[1:-1])]
        target = float(r[-1])
        # The ID value is not used, but a non-numeric ID still marks the row
        # as malformed and sends it to the fallback below.
        float(r[0])
        return LabeledPoint(target, features)
    except Exception:
        # Best-effort conversion: malformed rows become an all-zero point.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return LabeledPoint(0.0, [0.0] * 1932)


# Keep only rows with exactly 1934 fields; helper1 treats the first as an ID,
# the last as the target and the 1932 in between as features.
new_rdd = rdd.filter(lambda i: len(i) == 1934)
# Despite its name, `df` is the mapped result of `new_rdd`: one LabeledPoint per row.
df = new_rdd.map(helper1)

# Train with the trainer's default settings.
model = LogisticRegressionWithSGD.train(df)
# Fetch one converted row; the result is not stored (presumably for interactive inspection).
df.take(1)
Exemplo n.º 15
0
	splits = parsedData.randomSplit((0.9, 0.1))
	train_set = splits[0]
	train_set.cache()
	test_set = splits[1]
	test_set.cache()
	#NBmodel = NaiveBayes.train(train_set)
	#NB_socredLabel = numpy.array(test_set.map(lambda lp: (NBmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	#findCoveragePercent(NB_socredLabel, 0.4)
	SVMSGDmodel = SVMWithSGD.train(train_set)
	SVMSGDmodel.clearThreshold()
	SVM_scoredLabel = numpy.array(test_set.map(lambda lp: (SVMSGDmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	
	SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.4))
	SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.8))
	SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 1.0))
	LRSGDmodel = LogisticRegressionWithSGD.train(train_set)	
	LRSGDmodel.clearThreshold()
	LRSGD_scoedLabel = numpy.array(test_set.map(lambda lp: (LRSGDmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 0.4))
	LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 0.8))
	LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 1.0))
	LRLBFGSmodel = LogisticRegressionWithLBFGS.train(train_set)
	LRLBFGSmodel.clearThreshold()
	LRLBFGS_scoredLabel = numpy.array(test_set.map(lambda lp: (LRLBFGSmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 0.4))
	LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 0.8))
	LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 1.0))

def getAccumulatedPercentage(socredLabel):
	result = []
	total = socredLabel.sum()
import sys

from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint


def parsePoint(line):
    """
    Turn one space-separated line of numbers into an MLlib LabeledPoint.

    The first number is the label (a label of -1 is rewritten to 0 for
    MLlib); the remaining numbers are the features.
    """
    numbers = list(map(float, line.split(' ')))
    label = 0 if numbers[0] == -1 else numbers[0]
    return LabeledPoint(label, numbers[1:])


if __name__ == "__main__":
    # Usage: logistic_regression <file> <iterations>
    if len(sys.argv) != 3:
        print("Usage: logistic_regression <file> <iterations>",
              file=sys.stderr)
        # sys.exit is always available; the bare exit() helper is installed
        # by the site module and is meant for interactive sessions.
        sys.exit(-1)
    sc = SparkContext(appName="PythonLR")
    # Stop the context even if parsing the arguments or training fails.
    try:
        points = sc.textFile(sys.argv[1]).map(parsePoint)
        iterations = int(sys.argv[2])
        model = LogisticRegressionWithSGD.train(points, iterations)
        print("Final weights: " + str(model.weights))
        print("Final intercept: " + str(model.intercept))
    finally:
        sc.stop()
Exemplo n.º 17
0
    def create_model_libsvm(self, data, params):
        """Train an SGD logistic regression on ``data``.

        The iteration count is read from ``params['numIterations']`` and
        converted with ``int``; it defaults to 10 when the key is absent.
        """
        iteration_setting = params.get('numIterations', 10)
        return LogisticRegressionWithSGD.train(data, int(iteration_setting))
Exemplo n.º 18
0
    def test_classification(self):
        """Check classifiers on four dense points and round-trip the tree models.

        Every model must predict a value <= 0 for the points labeled 0.0 and a
        value > 0 for the points labeled 1.0. The decision tree, random forest
        and gradient-boosted trees models are additionally saved to a
        temporary directory, reloaded, and compared via ``toDebugString()``.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        def check_predictions(model):
            # features[0] and [2] belong to class 0, features[1] and [3] to class 1.
            for vector, positive in zip(features, (False, True, False, True)):
                prediction = model.predict(vector)
                self.assertTrue(prediction > 0 if positive else prediction <= 0)

        temp_dir = tempfile.mkdtemp()
        # The cleanup sits in a finally-block so a failing assertion does not
        # leave the temporary directory behind.
        try:
            lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
            check_predictions(lr_model)

            svm_model = SVMWithSGD.train(rdd, iterations=10)
            check_predictions(svm_model)

            nb_model = NaiveBayes.train(rdd)
            check_predictions(nb_model)

            categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
            dt_model = DecisionTree.trainClassifier(
                rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
            check_predictions(dt_model)

            dt_model_dir = os.path.join(temp_dir, "dt")
            dt_model.save(self.sc, dt_model_dir)
            same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
            self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

            rf_model = RandomForest.trainClassifier(
                rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
                maxBins=4, seed=1)
            check_predictions(rf_model)

            rf_model_dir = os.path.join(temp_dir, "rf")
            rf_model.save(self.sc, rf_model_dir)
            same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
            self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

            gbt_model = GradientBoostedTrees.trainClassifier(
                rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
            check_predictions(gbt_model)

            gbt_model_dir = os.path.join(temp_dir, "gbt")
            gbt_model.save(self.sc, gbt_model_dir)
            same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
            self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())
        finally:
            # Best-effort cleanup: a failure to delete must not mask the
            # outcome of the test itself.
            try:
                rmtree(temp_dir)
            except OSError:
                pass
Exemplo n.º 19
0
# Remove the header row, split each line on commas, and build labeled points
# with column 7 as the label and columns 8 and 9 as the two features.
header = ibm_rdd.first()
ibm_data_rdd = ibm_rdd.filter(lambda x: x != header) \
    .map(lambda x: x.split(',')) \
    .map(lambda x: LabeledPoint(x[7],[x[8],x[9]]))

# Peek at the first rows; the result is not stored.
ibm_data_rdd.take(5)

# Train and test the model 10 times, each on a fresh random 60/40 split.
lst_score = []
for i in range(10):
    ibm_train_rdd, ibm_test_rdd = ibm_data_rdd.randomSplit([.6, .4])
    lrm = (LogisticRegressionWithSGD.train(ibm_train_rdd,
                                           iterations=100,
                                           step=1.0,
                                           miniBatchFraction=1.0,
                                           initialWeights=None,
                                           regParam=0.01,
                                           regType='l2'))
    # Predictions and true labels are collected separately from the same RDD
    # and compared position by position by accuracy_score.
    lst_predicted = (ibm_test_rdd.map(lambda x: x.features).map(
        lambda x: lrm.predict(x)).collect())
    lst_truth = ibm_test_rdd.map(lambda x: x.label).collect()
    score = metrics.accuracy_score(lst_truth, lst_predicted)
    lst_score.append(score)

# Mean accuracy over the 10 runs. The call form of print works on both
# Python 2 and Python 3; the statement form is a syntax error on Python 3.
print(np.mean(lst_score))

#################################################
## demo 3: recommender system using ALS #########
#################################################
Exemplo n.º 20
0
# Turn the term-frequency vectors of `comment` into TF-IDF vectors.
tfVectors = tf.transform(comment)
idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)
print(tfIdfVectors.take(3))

# Use the RDD zip operator to join the two parts of the data (scores and
# TF-IDF vectors) and convert them to the LabeledPoint type used by the
# classification models.
zip_score_comment = score.zip(tfIdfVectors)
final_data = zip_score_comment.map(lambda line:LabeledPoint(line[0],line[1]))
# 80/20 train/test split with a fixed seed.
train_data,test_data = final_data.randomSplit([0.8,0.2],seed =0)
print(train_data.take(1))

# Time the training call only.
time_start = time.time()
print(time_start)
#SVMModel = SVMWithSGD.train(train_data,iterations=100)
lrm = LogisticRegressionWithSGD.train(train_data,iterations=1000)
time_end = time.time()
cost_time =  time_end - time_start
print("spark_lr cost_time:",cost_time)


# (prediction, label) pairs for the test set; the prediction is cast to float.
predictionAndLabels = test_data.map(lambda t:(float(lrm.predict(t.features)),t.label))
print(predictionAndLabels.take(5))

# NOTE: this rebinds the module-level name `metrics`.
metrics = MulticlassMetrics(predictionAndLabels)

print('accuracy:',metrics.accuracy)
print('precision:',metrics.weightedPrecision)
print('recall:',metrics.weightedRecall)
print('FMeasure:',metrics.weightedFMeasure())
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
# Two training points: label 0.0 for [0.0, 1.0] and label 1.0 for [1.0, 0.0].
data = [
    LabeledPoint(0.0, [0.0, 1.0]),
    LabeledPoint(1.0, [1.0, 0.0]),
]
lrm = LogisticRegressionWithSGD.train(sc.parallelize(data), iterations=10)
# Predict for single feature vectors; the results are not stored
# (presumably this is an interactive/doctest-style transcript).
lrm.predict([1.0, 0.0])
lrm.predict([0.0, 1.0])
# predict() is also called with an RDD of feature vectors; the predictions
# are collected to the driver.
lrm.predict(sc.parallelize([[1.0, 0.0], [0.0, 1.0]])).collect()
# Per the PySpark API docs, clearing the threshold makes predict() return
# raw scores instead of class labels.
lrm.clearThreshold()
lrm.predict([0.0, 1.0])
Exemplo n.º 22
0
def buildModel(trainrdd):
    """Train an SGD logistic regression on ``trainrdd`` with default settings.

    Returns whatever ``LogisticRegressionWithSGD.train`` returns.
    """
    # A linear-regression variant (LinearRegressionWithSGD) was tried here
    # at some point and is disabled.
    return LogisticRegressionWithSGD.train(trainrdd)
# Start each ensemble table as one single-element row per example, holding
# the true label; classifier predictions are appended as further columns.
test_labels = test_labels_rdd.collect()
ensemble_test = [[label] for label in test_labels]

train_labels_rdd = train_data.map(lambda p: p.label)
train_labels = train_labels_rdd.collect()
ensemble_train = [[label] for label in train_labels]

# C1
# Build the Model
model = LogisticRegressionWithSGD.train(train_data)

# Predict Labels
c1_predict_labels_test_rdd = test_data.map(lambda p:
                                           (model.predict(p.features)))
c1_predict_labels_train_rdd = train_data.map(lambda p:
                                             (model.predict(p.features)))
c1_predict_labels_test = c1_predict_labels_test_rdd.collect()
c1_predict_labels_train = c1_predict_labels_train_rdd.collect()

# Append Labels
appendColumn(ensemble_test, c1_predict_labels_test)
appendColumn(ensemble_train, c1_predict_labels_train)

# C2
# Build the Model
Exemplo n.º 24
0
import numpy as np

from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark import SparkContext, SparkConf
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

conf = SparkConf().setMaster("local").setAppName("Test")

sc = SparkContext(conf=conf)
# Stop the context even if training or writing the output fails.
try:
    # Despite the name, these are dense vectors. The list is defined once;
    # a second, identical assignment was redundant.
    sparse_data = [
        LabeledPoint(0.0, Vectors.dense([1.0, 0.0])),
        LabeledPoint(1.0, Vectors.dense([0.0, 1.0])),
        LabeledPoint(0.0, Vectors.dense([10.0, 9.0])),
        LabeledPoint(1.0, Vectors.dense([9.0, 10.0]))
    ]
    rdd = sc.parallelize(sparse_data)
    model = LogisticRegressionWithSGD.train(rdd, iterations=10)
    # Rebind `rdd` to the bare feature vectors and predict on the whole RDD.
    rdd = rdd.map(lambda x:x.features)
    model.predict(rdd).saveAsTextFile("result/hdfs")
finally:
    sc.stop()
# NOTE: map() only defines the transformation. With the counts below
# disabled, no Spark job runs between vectorize_start and vectorize_end, so
# the measured interval does not include the actual vectorization.
vectorize_start = time.time()
vectorized_data = training_data.map(mapper_CF)
vectorized_testing_data = testing_data.map(mapper_CF)
# The string literal below is a disabled code block (instance counts and
# train/test percentages); it is evaluated as a no-op expression.
"""
train_instances = vectorized_data.count()
test_instances = vectorized_testing_data.count()
total_instances = train_instances + test_instances
train_per = float(train_instances)/total_instances * 100
test_per = float(test_instances)/total_instances * 100
#"""
vectorize_end = time.time()
# The call form of print works on both Python 2 and Python 3.
print("******************VECTORIZING: DONE********************")

#building a logistic regression training model
train_start = time.time()
model = LogisticRegressionWithSGD.train(vectorized_data)
train_end = time.time()
print("******************MODEL TRAINING: DONE********************")

#predicting classes for testing data and evaluating
def mapper_predict(x):
    """Return ``(actual_class, predicted_class)`` for one labeled point.

    The prediction comes from the module-level ``model`` applied to
    ``x.features``; the actual class is ``x.label``.
    """
    prediction = model.predict(x.features)
    return (x.label, prediction)

# Time the prediction pass; count() is the action that runs the map.
pred_start = time.time()
actual_and_predicted = vectorized_testing_data.map(mapper_predict)
count = actual_and_predicted.count()
pred_end = time.time()
# The call form of print works on both Python 2 and Python 3.
print("******************PREDICTION: DONE********************")
Exemplo n.º 26
0
def spark_create_model(data_size, file_path, store=False):
    """
        Spark Model Creation

        Load the Twitter, YouTube and Facebook JSON dumps, attach a sentiment
        score to every item, turn the items into MLlib LabeledPoints, train a
        logistic regression model on an 80/20 split and print the training
        and test error rates.

        data_size -- 'small' selects the small_*.json dumps; any other value
                     selects the full dumps.
        file_path -- destination passed to model.save() when `store` is set.
        store     -- when truthy, persist the trained model to `file_path`.

        Returns None. The SparkContext created here is always stopped, even
        when loading, training or evaluation raises.
    """
    # Set this variable to distinguish between logistic and linear regression
    REGRESSION_TYPE = 'logistic'

    # The three sources differ only by name, so build their paths in one place.
    size_prefix = "small_" if data_size == 'small' else ""

    sc = SparkContext(appName="SparkCreateModel")

    def _load(source):
        # Read one source's JSON dump from the local mongoData directory.
        return load_data_from_file(
            sc, "file:///root/mongoData/%s%s.json" % (size_prefix, source))

    try:
        twitter_data = _load("twitter")
        # Only the YouTube data goes through an extra filter before scoring.
        youtube_data = _load("youtube").filter(filter_youtube_data)
        facebook_data = _load("facebook")

        # Store the sentiment score for each data item
        sent_twitter_data = twitter_data.map(
            lambda x: get_sentiment(x, 'twitter'))
        sent_youtube_data = youtube_data.map(
            lambda x: get_sentiment(x, 'youtube'))
        sent_facebook_data = facebook_data.map(
            lambda x: get_sentiment(x, 'facebook'))

        # create MLLib LabeledPoints
        twitter_LP = sent_twitter_data.map(
            lambda x: create_labeled_points_twitter(x, REGRESSION_TYPE))
        youtube_LP = sent_youtube_data.map(
            lambda x: create_labeled_points_youtube(x, REGRESSION_TYPE))
        facebook_LP = sent_facebook_data.map(
            lambda x: create_labeled_points_facebook(x, REGRESSION_TYPE))

        # split data in to training (80%) and test(20%) sets; the fixed seed
        # keeps the split reproducible between runs
        train_twitter, test_twitter = twitter_LP.randomSplit(
            [0.8, 0.2], seed=0)
        train_youtube, test_youtube = youtube_LP.randomSplit(
            [0.8, 0.2], seed=0)
        train_facebook, test_facebook = facebook_LP.randomSplit(
            [0.8, 0.2], seed=0)

        # combine all 3 datasets with the RDD.union command
        train_LP = train_twitter.union(train_facebook).union(train_youtube)
        test_LP = test_twitter.union(test_facebook).union(test_youtube)
        # SGD passes over the training set many times; without caching every
        # iteration would re-read and re-score the raw input.
        train_LP.cache()

        # Build logistic regression model
        model_log = LogisticRegressionWithSGD.train(train_LP)
        if store:
            model_log.save(sc, file_path)

        # Evaluate the model on training data. Pairs are indexed instead of
        # using tuple-parameter lambdas, which Python 3 does not accept.
        preds_train_log = train_LP.map(
            lambda p: (p.label, model_log.predict(p.features)))
        total_train = float(train_LP.count())
        trainErr_log = preds_train_log.filter(
            lambda vp: vp[0] != vp[1]).count() / total_train

        # Evaluate the model on test data
        preds_test_log = test_LP.map(
            lambda p: (p.label, model_log.predict(p.features)))
        total_test = float(test_LP.count())
        testErr_log = preds_test_log.filter(
            lambda vp: vp[0] != vp[1]).count() / total_test

        twitter_LP_count = twitter_LP.count()
        youtube_LP_count = youtube_LP.count()
        facebook_LP_count = facebook_LP.count()

        print('TWITTER LP COUNT %d' % (twitter_LP_count))
        print('YOUTUBE LP COUNT %d' % (youtube_LP_count))
        print('FACEBOOK LP COUNT %d' % (facebook_LP_count))

        print("Train Error = " + str(trainErr_log))
        print("Test Error = " + str(testErr_log))
        print(model_log)
    finally:
        sc.stop()
Exemplo n.º 27
0
		label = 0	
	values = [x if x < genre else x-1 for x in values] #shift the attributes by one index
	ones = []
	ones = [1] * len(values)
	return LabeledPoint(label, SparseVector(column_num-1, values, ones))


# Input location (HDFS sequence files). The original script assigned `data`
# twice in a row, so only the second path was ever used; the first is kept
# here as a comment for switching back to the cluster data set.
# data = sc.sequenceFile("hdfs://nameservice1/user/geap/warehouse/camus/etl/rat/hourly/2015/06/01/00/*")
data = sc.sequenceFile("hdfs://localhost:9000/test/*")

# Keep the wanted records, merge the values that share a key, and reduce each
# key's values to a de-duplicated list (the key itself is dropped).
parsedData = (data.filter(filterPoint)
              .map(parsePoint)
              .reduceByKey(lambda x, y: x + y)
              .map(lambda kv: list(set(kv[1]))))
parsedData.cache()

# Distinct column ids across the dataset, sorted. The column count is the
# length of that list, so a second distinct() job is not needed.
column_id = sorted(parsedData.flatMap(lambda ids: ids).distinct().collect())
column_num = len(column_id)

# Column used as the target variable.
# NOTE(review): the original comment said the default is the 100th column,
# but the value has always been 1 -- confirm which one is intended.
genre = 1

sortedData = parsedData.map(sortPoint)

# Pair every row with the target column so labelData can build the point.
labeledData = sortedData.map(lambda line: (line, genre)).map(labelData)

LRSGDmodel = LogisticRegressionWithSGD.train(labeledData)

print(LRSGDmodel.weights)

Exemplo n.º 28
0
def parse_interaction_chi(line):
    """Build a LabeledPoint from one comma-separated record, on a reduced feature set.

    Columns 1, 2, 3, 19, 20 and 41 are left out of the features; column 41
    is the tag that decides the label. The label is 0.0 when the tag is
    'normal.' and 1.0 (attack) for anything else.
    """
    fields = line.split(",")
    kept = fields[0:1] + fields[4:19] + fields[21:41]
    label = 0.0 if fields[41] == 'normal.' else 1.0
    return LabeledPoint(label, np.array([float(value) for value in kept]))


# Parse both splits with the reduced-feature parser.
training_data_chi = raw_data.map(parse_interaction_chi)
test_data_chi = test_raw_data.map(parse_interaction_chi)

# Train, timing only the call to train().
t0 = time()
logit_model_chi = LogisticRegressionWithSGD.train(training_data_chi)
tt = time() - t0

print("Classifier trained in {} seconds".format(round(tt, 3)))

# (true label, predicted label) for every test record; evaluated lazily.
labels_and_preds = test_data_chi.map(
    lambda p: (p.label, logit_model_chi.predict(p.features)))

# The two count() actions below are what actually run the predictions.
t0 = time()
n_correct_chi = labels_and_preds.filter(lambda vp: vp[0] == vp[1]).count()
test_accuracy = n_correct_chi / float(test_data_chi.count())
tt = time() - t0

print("Prediction made in {} seconds. Test accuracy is {}".format(
    round(tt, 3), round(test_accuracy, 4)))

# ------------- RDD basics ------------------
Exemplo n.º 29
0
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD

conf = SparkConf()
conf.setMaster("local")
conf.setAppName("My App")
sc = SparkContext(conf=conf)

spam = sc.textFile("/home/sakib/spark-1.3.1/spark_workspace/data/spam.txt")
normal = sc.textFile("/home/sakib/spark-1.3.1/spark_workspace/data/ham.txt")

# HashingTF maps each email's words onto a vector of 10,000 features.
tf = HashingTF(numFeatures=10000)


def _featurize(email):
    """Split an email on single spaces and hash the words into a feature vector."""
    return tf.transform(email.split(" "))


spamFeatures = spam.map(_featurize)
normalFeatures = normal.map(_featurize)

# Label spam as 1 (positive) and normal mail as 0 (negative).
positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
negativeExamples = normalFeatures.map(lambda features: LabeledPoint(0, features))
trainingData = positiveExamples.union(negativeExamples)
# Logistic regression is iterative, so keep the training set in memory.
trainingData.cache()

# Fit the model with the SGD optimizer.
model = LogisticRegressionWithSGD.train(trainingData)

# Try one spam-like and one normal-looking sentence, pushed through the same
# hashing transformation as the training emails.
posTest = _featurize("O M G GET cheap stuff by sending money to ...")
negTest = _featurize("Hi Dad, I started studying Spark the other ...")
print("Prediction for positive test example: %g" % model.predict(posTest))
print("Prediction for negative test example: %g" % model.predict(negTest))

sqlContext = SQLContext(sc)


def _show_first_rows(prefix, rdd):
    """Print the first ten elements of rdd between the two debug banners."""
    print("@@@@@@@@@@@@@@@@@@@@@ 0 @@@@@@@@@@@@@@@@@@@@@@")
    print(prefix + str(rdd.take(10)))
    print("@@@@@@@@@@@@@@@@@@@@@ END0 @@@@@@@@@@@@@@@@@@@@@@")


### Prepare Undirected Data

# Read the undirected relations from JSON and parse every row into a point.
undirected_relation_df = sqlContext.read.load(
    undirected_relation_json_file, format="json")
undirectedParsedData = undirected_relation_df.map(
    lambda point: parsePoint(point))

_show_first_rows("undirectedParsedData == ", undirectedParsedData)

### Build model: undirected
undirectedModel = LogisticRegressionWithSGD.train(undirectedParsedData)
# With the threshold cleared, predict() returns raw scores instead of 0/1.
undirectedModel.clearThreshold()
# NOTE(review): "undireced" is misspelled in the saved path; anything that
# loads the model must use the same spelling.
undirectedModel.save(sc, mainPath + "undireced_relation_model")

# (label, score) for every training point, then keyed by position so the
# result can be joined on the index.
undirectedLabelsAndPreds = undirectedParsedData.map(
    lambda point: (point.label,
                   float(undirectedModel.predict(point.features))))
undirectedLabelsAndPredsIndexed = undirectedLabelsAndPreds.zipWithIndex().map(
    lambda pair: (pair[1], pair[0]))

_show_first_rows("undirectedLabelsAndPredsIndexed == ",
                 undirectedLabelsAndPredsIndexed)

######################################################
#join with productsIds
productsIds = undirected_relation_df.map(lambda point: getIds(point))
            .map(lambda lp: len(lp.features.indices))
            .sum())
Test.assertEquals(numNZVal, 372080, 'incorrect number of features')


# ** CTR prediction and logloss evaluation **
from pyspark.mllib.classification import LogisticRegressionWithSGD

# fixed hyperparameters
numIters = 50
stepSize = 10.
regParam = 1e-6
regType = 'l2'
includeIntercept = True

model0 = LogisticRegressionWithSGD.train(OHETrainData, numIters, stepSize, 1.0, None, regParam, regType, includeIntercept)
sortedWeights = sorted(model0.weights)
print sortedWeights[:5], model0.intercept


# TEST Logistic regression
Test.assertTrue(np.allclose(model0.intercept,  0.56455084025), 'incorrect value for model0.intercept')
Test.assertTrue(np.allclose(sortedWeights[0:5],
                [-0.45899236853575609, -0.37973707648623956, -0.36996558266753304,
                 -0.36934962879928263, -0.32697945415010637]), 'incorrect value for model0.weights')


# ** Log loss **
from math import log

def computeLogLoss(p, y):
def train(row_id_str,
          ds_id,
          hdfs_feat_dir,
          local_out_dir,
          ml_opts_jstr,
          excluded_feat_cslist,
          sp_master,
          spark_rdd_compress,
          spark_driver_maxResultSize,
          sp_exe_memory,
          sp_core_max,
          zipout_dir,
          zipcode_dir,
          zip_file_name,
          mongo_tuples,
          labelnameflag,
          fromweb,
          training_fraction,
          jobname,
          random_seed=None):

    ### generate data folder and out folder, clean up if needed
    #local_out_dir = local_out_dir + "/"
    #if os.path.exists(local_out_dir):
    #    shutil.rmtree(local_out_dir) # to keep smaplelist file
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # create zip files for Spark workers ================= ================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir,
                                              zipcode_dir,
                                              zip_file_name,
                                              prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    t0 = time()

    # check if ml_opts.has_excluded_feat ==1 ===================================
    has_excluded_feat = 0
    ml_opts = {}
    if not ml_opts_jstr is None:
        ml_opts = json.loads(ml_opts_jstr)
        if "has_excluded_feat" in ml_opts:
            has_excluded_feat = ml_opts["has_excluded_feat"]
    #print "has_excluded_feat=",has_excluded_feat,",excluded_feat_cslist=",excluded_feat_cslist

    # get excluded feature list from mongo ========== ===
    if str(has_excluded_feat) == "1" and excluded_feat_cslist is None:
        excluded_feat_cslist = ml_util.ml_get_excluded_feat(
            row_id_str, mongo_tuples)
    print "INFO: excluded_feat_cslist=", excluded_feat_cslist

    # filename for featured data
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print "INFO: libsvm_data_file:", libsvm_data_file

    # load feature count file
    feat_count_file = libsvm_data_file + "_feat_count"
    feature_count = zip_feature_util.get_feature_count(sc, feat_count_file)
    print "INFO: feature_count=", feature_count

    # load sample RDD from text file
    #   also exclude selected features in sample ================ =====
    # format (LabeledPoint,hash) from str2LabeledPoint_hash()
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(
        sc, libsvm_data_file, feature_count, excluded_feat_cslist)

    # get distinct label list
    labels_list_all = samples_rdd.map(
        lambda p: p[0].label).distinct().collect()

    # split samples to training and testing data, format (LabeledPoint,hash)
    training_rdd, testing_rdd = samples_rdd.randomSplit(
        [training_fraction, 1 - training_fraction], seed=int(random_seed))
    training_rdd = training_rdd.map(lambda p: p[0])  # keep LabeledPoint only
    training_rdd.cache()
    training_sample_count = training_rdd.count()
    training_lbl_cnt_list = training_rdd.map(
        lambda p: (p.label, 1)).reduceByKey(add).collect()
    testing_rdd.cache()
    testing_sample_count = testing_rdd.count()
    testing_lbl_cnt_list = testing_rdd.map(
        lambda p: (p[0].label, 1)).reduceByKey(add).collect()
    sample_count = training_sample_count + testing_sample_count

    # random_seed testing
    if not random_seed is None:
        all_t = testing_rdd.collect()
        all_t = sorted(all_t, key=lambda x: x[1])
        cnt = 0
        for i in all_t:
            print i[1]
            cnt = cnt + 1
            if cnt > 3:
                break

    t1 = time()
    print "INFO: training sample count=", training_sample_count, ", testing sample count=", testing_sample_count
    print "INFO: training label list=", training_lbl_cnt_list, ", testing label list=", testing_lbl_cnt_list
    print "INFO: labels_list_all=", labels_list_all
    print "INFO: training and testing samples generated!"
    print 'INFO: running time: %f' % (t1 - t0)
    t0 = t1

    ###############################################
    ###########build learning model################
    ###############################################

    ### get the parameters###
    print "INFO: ======Learning Algorithm and Parameters============="
    #ml_opts = json.loads(ml_opts_jstr)
    model_name = ml_opts[
        'learning_algorithm']  # 1: linear_svm_with_sgd; 2: logistic_regression_with_lbfgs; 3: logistic_regression_with_sgd
    iteration_num = 0
    if 'iterations' in ml_opts:
        iteration_num = ml_opts['iterations']
    C = 0
    if 'c' in ml_opts:
        C = eval(ml_opts['c'])
    regularization = ""
    if 'regularization' in ml_opts:
        regularization = ml_opts['regularization']

    print "INFO: Learning Algorithm: ", model_name
    print "INFO: C = ", C
    print "INFO: iterations = ", iteration_num
    print "INFO: regType = ", regularization
    regP = C / float(training_sample_count)
    print "INFO: Calculated: regParam = ", regP

    ### generate label names (family names) #####
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    if labelnameflag == 1:
        '''
        key = "dic_name_label"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'
 
        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
 
        doc=query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']
        print "INFO: dic_list=",dic_list
        
        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        '''
        label_dic = ml_util.ml_get_label_dict(row_id_str, mongo_tuples, ds_id)
        print "INFO: label_dic:", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: generated label_dic:", label_dic

    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])
    print "INFO: labels:", labels_list
    class_num = len(labels_list)
    if class_num > 2:
        print "INFO: Multi-class classification! Number of classes = ", class_num

    ### build model ###

    if model_name == "linear_svm_with_sgd":
        ### 1: linearSVM
        print "INFO: ====================1: Linear SVM============="
        model_classification = SVMWithSGD.train(
            training_rdd,
            regParam=regP,
            iterations=iteration_num,
            regType=regularization)  # regParam = 1/(sample_number*C)
        #print model_classification
    elif model_name == "logistic_regression_with_lbfgs":
        ### 2: LogisticRegressionWithLBFGS
        print "INFO: ====================2: LogisticRegressionWithLBFGS============="
        model_classification = LogisticRegressionWithLBFGS.train(
            training_rdd,
            regParam=regP,
            iterations=iteration_num,
            regType=regularization,
            numClasses=class_num)  # regParam = 1/(sample_number*C)
    elif model_name == "logistic_regression_with_sgd":
        ### 3: LogisticRegressionWithSGD
        print "INFO: ====================3: LogisticRegressionWithSGD============="
        model_classification = LogisticRegressionWithSGD.train(
            training_rdd,
            regParam=regP,
            iterations=iteration_num,
            regType=regularization)  # regParam = 1/(sample_number*C)
    else:
        print "INFO: Training model selection error: no valid ML model selected!"
        return

    print "INFO: model type=", type(model_classification)

    # create feature coefficient file ================================
    coef_arr = None
    intercept = None
    if model_classification.weights is None:
        print "WARNING: model weights not found!"
    else:
        coef_weights = model_classification.weights
        #print "coef_weights=",coef_weights
        #print type(coef_weights),coef_weights.shape
        coef_arr = coef_weights.toArray().tolist()
        # save coef_arr to mongo
        key = "coef_arr"
        ret = ml_util.save_json_t(row_id_str, key, coef_arr, mongo_tuples)

        # save coef_arr to local file
        if ret == 0:
            # drop old record in mongo
            filter = '{"rid":' + row_id_str + ',"key":"coef_arr"}'
            ret = query_mongo.delete_many(mongo_tuples, None, filter)
            if not os.path.exists(local_out_dir):
                os.makedirs(local_out_dir)
            fn_ca = os.path.join(local_out_dir, row_id_str,
                                 row_id_str + "_coef_arr.pkl")
            print
            ml_util.ml_pickle_save(coef_arr, fn_ca)

        # save intercept to mongo
        intercept = model_classification.intercept
        key = "coef_intercept"
        ret = ml_util.save_json_t(row_id_str, key, intercept, mongo_tuples)

        # feature list + coef file =============
        feat_filename = os.path.join(local_out_dir,
                                     row_id_str + "_feat_coef.json")
        print "INFO: feat_filename=", feat_filename

        # create feature, coef & raw string file =============================================== ============
        # expect a dict of {"fid":(coef, feature_raw_string)}
        jret = ml_util.build_feat_list_t(row_id_str, feat_filename, None, None,
                                         coef_arr, ds_id, mongo_tuples)

        # special featuring for IN or libsvm
        if jret is None:
            jret = ml_util.build_feat_coef_raw_list_t(row_id_str,
                                                      feat_filename, coef_arr,
                                                      ds_id, mongo_tuples)
        if jret is None:
            print "WARNING: Cannot create sample list for testing dataset. "

        jfeat_coef_dict = jret
        print "INFO: coef_arr len=", len(
            coef_arr), ", feature_count=", feature_count
        # for multi-class
        if len(coef_arr) != feature_count:
            jfeat_coef_dict = {}
            print "WARNING: coef count didn't match feature count.  multi-class classification was not supported"

        # Calculate prediction and Save testing dataset
        bt_coef_arr = sc.broadcast(coef_arr)
        bt_intercept = sc.broadcast(intercept)
        bt_jfeat_coef_dict = sc.broadcast(jfeat_coef_dict)
        ### Evaluating the model on testing dataset: label, predict label, score, feature list
        print "INFO: intercept=", intercept
        print "INFO: coef_arr len=", len(coef_arr), type(coef_arr)
        print "INFO: jfeat_coef_dict len=", len(
            jfeat_coef_dict)  #, jfeat_coef_dict

        # get prediction of testing dataset : (tlabel, plabel, score, libsvm, raw feat str, hash) ==============================
        if len(coef_arr) == feature_count:
            testing_pred_rdd = testing_rdd.map(lambda p: (
                 p[0].label \
                ,model_classification.predict(p[0].features) \
                ,zip_feature_util.calculate_hypothesis(p[0].features, bt_coef_arr.value, bt_intercept.value, model_name) \
                ,p[0].features \
                ,p[1] \
            ) ).cache()
        else:  # for multi-class, no prediction score; TBD for better solution: how to display multiple weights for each class
            testing_pred_rdd = testing_rdd.map(lambda p: (
                 p[0].label \
                ,model_classification.predict(p[0].features) \
                ,"-" \
                ,p[0].features \
                ,p[1] \
            ) ).cache()
        ''',p[0].features.dot(bt_coef_arr.value)+bt_intercept.value \
        # Save testing dataset for analysis
        libsvm_testing_output = hdfs_feat_dir + "libsvm_testing_output_"+row_id_str
        print "INFO: libsvm_testing_output=", libsvm_testing_output
        try:
            hdfs.rmr(libsvm_testing_output)
        except IOError as e:
            print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
        except:
            print "WARNING: Unexpected error at libsvm_testing_output file clean up:", sys.exc_info()[0] 
        # save only false prediction?
        #testing_pred_rdd.filter(lambda p: p[0] != p[1]).saveAsTextFile(libsvm_testing_output)
        testing_pred_rdd.saveAsTextFile(libsvm_testing_output)
        
        '''
        #test_tmp=testing_pred_rdd.collect()

        # save false prediction to local file
        false_pred_fname = os.path.join(local_out_dir,
                                        row_id_str + "_false_pred.json")
        print "INFO: false_pred_fname=", false_pred_fname
        false_pred_data=testing_pred_rdd.filter(lambda p: p[0] != p[1])\
            .map(lambda p: (p[0],p[1],p[2] \
            ,zip_feature_util.get_dict_coef_raw4feat(zip_feature_util.sparseVector2dict(p[3]), bt_jfeat_coef_dict.value)
            ,p[4]  ) ) \
            .collect()
        print "INFO: false predicted count=", len(false_pred_data)
        false_pred_arr = []
        with open(false_pred_fname, "w") as fp:
            for sp in false_pred_data:
                jsp = {
                    "tlabel": sp[0],
                    "plabel": sp[1],
                    "score": sp[2],
                    "feat": sp[3],
                    "hash": sp[4]
                }
                #print "jsp=",jsp
                false_pred_arr.append(jsp)
            fp.write(json.dumps(false_pred_arr))

        # save prediction results, format: label, prediction, hash
        pred_ofname = os.path.join(local_out_dir,
                                   row_id_str + "_pred_output.pkl")
        print "INFO: pred_ofname=", pred_ofname
        pred_out_arr = testing_pred_rdd.map(lambda p:
                                            (p[0], p[1], p[4])).collect()
        ml_util.ml_pickle_save(pred_out_arr, pred_ofname)
        '''
        one_item= testing_pred_rdd.first()
        print "one_item=",one_item
        sparse_arr=one_item[3]

        dict_feat=zip_feature_util.sparseVector2dict(sparse_arr)
        print "len=",len(dict_feat),"dict_feat=",dict_feat
        dict_weit=zip_feature_util.add_coef2dict(coef_arr,dict_feat)
        print "len=",len(dict_weit),"dict_weit=",dict_weit
        '''
    # Calculate Accuracy. labelsAndPreds = (true_label,predict_label)
    labelsAndPreds = testing_pred_rdd.map(lambda p: (p[0], p[1]))
    labelsAndPreds.cache()
    testing_sample_number = testing_rdd.count()
    testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(
        testing_sample_number)
    accuracy = 1 - testErr
    print "INFO: Accuracy = ", accuracy

    ### Save model
    #save_dir = config.get('app', 'HADOOP_MASTER')+'/user/hadoop/yigai/row_6/'
    #save_dir = config.get('app', 'HADOOP_MASTER')+config.get('app', 'HDFS_MODEL_DIR')+'/'+row_id_str
    save_dir = os.path.join(config.get('app', 'HADOOP_MASTER'),
                            config.get('app', 'HDFS_MODEL_DIR'), row_id_str)
    try:
        hdfs.ls(save_dir)
        #print "find hdfs folder"
        hdfs.rmr(save_dir)
        #print "all files removed"
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(
            e.errno, e.strerror), ". At HDFS=", save_dir
    except:
        print "WARNING: Unexpected error:", sys.exc_info(
        )[0], ". At HDFS=", save_dir
    model_classification.save(sc, save_dir)

    ###load model if needed
    #sameModel = SVMModel.load(sc, save_dir)

    t1 = time()
    print 'INFO: training run time: %f' % (t1 - t0)
    t0 = t1

    ###############################################
    ###########plot prediction result figure ==================================================== ===============
    ###############################################

    labels = labelsAndPreds.collect()
    true_label_list = [x for x, _ in labels]
    pred_label_list = [x for _, x in labels]

    pred_fname = os.path.join(local_out_dir, row_id_str + "_1" + ".png")
    true_fname = os.path.join(local_out_dir, row_id_str + "_2" + ".png")
    pred_xlabel = 'Prediction (Single Run)'
    true_xlabel = 'True Labels (Single Run)'
    test_cnt_dic = ml_util.ml_plot_predict_figures(
        pred_label_list, true_label_list, labels_list, label_dic,
        testing_sample_count, pred_xlabel, pred_fname, true_xlabel, true_fname)
    print "INFO: figure files: ", pred_fname, true_fname
    #print "INFO: Number of samples in each label is=", test_cnt_dic

    roc_auc = None
    perf_measures = None
    dataset_info = {
        "training_fraction": training_fraction,
        "class_count": class_num,
        "dataset_count": sample_count
    }
    #############################################################
    ###################for 2 class only (plot ROC curve) ==================================================== ===============
    #############################################################
    if len(labels_list) == 2:

        do_ROC = True
        reverse_label_dic = dict((v, k) for k, v in label_dic.items())
        if 'clean' in reverse_label_dic:
            flag_clean = reverse_label_dic['clean']
        elif 'benign' in reverse_label_dic:
            flag_clean = reverse_label_dic['benign']
        elif '0' in reverse_label_dic:
            flag_clean = 0
        else:
            print "INFO: No ROC curve generated: 'clean','benign' or '0' must be a label for indicating negative class!"
            do_ROC = False

        # build data file for score graph
        score_graph_fname = os.path.join(local_out_dir,
                                         row_id_str + "_score_graph.json")
        print "INFO: score_graph_fname=", score_graph_fname

        # build score_arr_0, score_arr_1
        #    format: tlabel, plabel, score, libsvm, raw feat str, hash
        graph_arr = testing_pred_rdd.map(lambda p:
                                         (int(p[0]), float(p[2]))).collect()
        score_arr_0 = []
        score_arr_1 = []
        max_score = 0
        min_score = 0
        for p in graph_arr:
            if p[0] == 0:
                score_arr_0.append(p[1])
            else:
                score_arr_1.append(p[1])
            # save max,min score
            if p[1] > max_score:
                max_score = p[1]
            elif p[1] < min_score:
                min_score = p[1]

        ml_build_pred_score_graph(score_arr_0, score_arr_1, model_name,
                                  score_graph_fname, max_score, min_score)

        if do_ROC:

            perf_measures = ml_util.calculate_fscore(true_label_list,
                                                     pred_label_list)
            print "RESULT: perf_measures=", perf_measures
            '''
            # calculate fscore  ==========
            tp = labelsAndPreds.filter(lambda (v, p): v == 1 and p==1 ).count() 
            fp = labelsAndPreds.filter(lambda (v, p): v == 0 and p==1 ).count() 
            fn = labelsAndPreds.filter(lambda (v, p): v == 1 and p==0 ).count() 
            tn = labelsAndPreds.filter(lambda (v, p): v == 0 and p==0 ).count() 
            print "RESULT: tp=",tp,",fp=",fp,",fn=",fn,",tn=",tn
            precision=float(tp)/(tp+fp)
            recall=float(tp)/(tp+fn)
            print "RESULT: precision=",precision,",recall=",recall
            acc=(tp+tn)/(float(testing_sample_number))
            fscore=2*((precision*recall)/(precision+recall))
            print "RESULT: fscore=",fscore,",acc=",acc  
            '''
            model_classification.clearThreshold()
            scoreAndLabels = testing_rdd.map(lambda p: (
                model_classification.predict(p[0].features), int(p[0].label)))
            #metrics = BinaryClassificationMetrics(scoreAndLabels)
            #areROC = metrics.areaUnderROC
            #print areROC
            scoreAndLabels_list = scoreAndLabels.collect()

            if flag_clean == 0:
                scores = [x for x, _ in scoreAndLabels_list]
                s_labels = [x for _, x in scoreAndLabels_list]
                testing_N = test_cnt_dic[0]
                testing_P = test_cnt_dic[1]
            else:
                scores = [-x for x, _ in scoreAndLabels_list]
                s_labels = [1 - x for _, x in scoreAndLabels_list]
                testing_N = test_cnt_dic[1]
                testing_P = test_cnt_dic[0]

            # create ROC data file ======== ====
            roc_auc = ml_create_roc_files(row_id_str, scores, s_labels,
                                          testing_N, testing_P, local_out_dir,
                                          row_id_str)
            #, local_out_dir, file_name_given)

            perf_measures["roc_auc"] = roc_auc

    # only update db for web request ==================================================== ===============
    if fromweb == "1":
        #print "database update"
        str_sql="UPDATE atdml_document set "+"accuracy = '"+str(accuracy*100)+"%" \
            +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \
            +"', perf_measures='"+json.dumps(perf_measures) \
            +"', dataset_info='"+json.dumps(dataset_info) \
            +"' where id="+row_id_str
        ret = exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        print "INFO: accuracy = '" + str(accuracy * 100) + "%"

    print 'INFO: Finished!'
    return 0
Exemplo n.º 33
0
X_train = sc.parallelize(np.random.uniform(0,10,[nb_train,P]))
w = np.random.uniform(size=[P+1,])
y_train = X_train.map(f = lambda a: w[0] + np.dot(a,w[1:]))
y_train_mean = y_train.mean()
y_train = y_train.map(f = lambda val: 1 if val > y_train_mean else 0)
data_train = y_train.zip(X_train).map(f = lambda tu: LabeledPoint(tu[0],tu[1]))


X_test = sc.parallelize(np.random.uniform(0,10,[nb_test,P]))
y_test = X_test.map(f = lambda a: w[0] + np.dot(a,w[1:]))
y_test_mean = y_test.mean()
y_test = y_test.map(f = lambda val: 1 if val > y_test_mean else 0)

 
t1 = time.time()    
lrm = LogisticRegressionWithSGD.train(data_train,iterations = 10)
y_pred = lrm.predict(X_test)

print "*******************************"
nb_corr = np.sum(np.array(y_test.collect()) == np.array(y_pred.collect()))
print nb_corr
print "the accuracy is ", nb_corr/float(nb_test)
print "*******************************"
t2 = time.time()
print "time elapsed spark logistic regression ", t2 - t1



lrm_bfgs = LogisticRegressionWithLBFGS.train(data_train)
y_pred = lrm_bfgs.predict(X_test)
print "*******************************"
Exemplo n.º 34
0
    return sc

# SparkContext used by the rest of this script, obtained from getSparkContext().
sc = getSparkContext()

# Load and parse the data
# textFile yields one RDD element per line of /data.txt.
data = sc.textFile("/data.txt")

def mapper(line):
    """
    Convert one comma-separated input line into a LabeledPoint.

    The label is the last field of the line and every preceding field is a
    feature.  The label is handed to LabeledPoint separately and is NOT
    placed in the feature vector: the previous version inserted it at
    index 0 of the features, which leaked the target into the model input.

    :param line: text line of comma-separated numeric fields, label last
    :return: LabeledPoint(label, features) with all values parsed as float
    :raises ValueError: if any field cannot be parsed as a float
    """
    fields = line.strip().split(",")
    label = float(fields[-1])
    features = [float(value) for value in fields[:-1]]  # need floats
    return LabeledPoint(label, features)

# Parse every input line into a LabeledPoint.
parsedData = data.map(mapper)

# Fit logistic regression with SGD for 100 iterations.
model = LogisticRegressionWithSGD.train(parsedData, iterations=100)

# Pair each true label with the prediction for the same point.
labelsAndPreds = parsedData.map(lambda point: (point.label, model.predict(point.features)))

# Fraction of mismatches, measured on the same data the model was trained on.
trainErr = labelsAndPreds.filter(lambda p: p[0] != p[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))


Exemplo n.º 35
0
# Imports
# SGD = Stochastic Gradient Descent. Convex optimization to optimize objective functions.
from pyspark.mllib.classification import LogisticRegressionWithSGD

from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext
from numpy import array

sc = SparkContext("local", "SVM")


# Loading and parsing data
def parsePoint(line):
    # Turn one space-separated line of numbers into a LabeledPoint:
    # the first number is the label, the remaining ones are the features.
    numbers = []
    for token in line.split(' '):
        numbers.append(float(token))
    label = numbers[0]
    features = numbers[1:]
    return LabeledPoint(label, features)


# Sample data provided by Spark 1.3.1 folder
data = sc.textFile("jingrong/sample_svm_data.txt")
# One LabeledPoint per input line.
parsedData = data.map(parsePoint)

# Building the model (all training parameters left at their defaults)
model = LogisticRegressionWithSGD.train(parsedData)

# Evaluate the model based on training data: pair (label, prediction) and
# take the fraction of pairs that disagree.
labelAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainingError = labelAndPreds.filter(lambda (v, p): v != p).count() / float(
    parsedData.count())

print "Training Error: ", str(trainingError)
Exemplo n.º 36
0
                                   random_state=0)

# For every train/test split: fit a logistic regression on the training
# rows, score the test rows, and add that split's ROC curve to the plot.
for train_index, test_index in ss:
    # Gather the rows selected for this split.
    X_training = [X[i] for i in train_index]
    Y_training = [Y[i] for i in train_index]
    X_test = [X[i] for i in test_index]
    Y_test = [Y[i] for i in test_index]

    # One LabeledPoint per training row.
    parsedData = [LabeledPoint(label, features)
                  for label, features in zip(Y_training, X_training)]

    model = LogisticRegressionWithSGD.train(sc.parallelize(parsedData))
    # Clear the decision threshold so predict() yields a score that
    # roc_curve can sweep over, rather than a hard 0/1 label.
    model.clearThreshold()
    probas = [model.predict(x) for x in X_test]

    # Compute ROC curve and area under the curve.
    # roc_curve returns (fpr, tpr, thresholds) in that order.  The previous
    # code unpacked it as (tpr, fpr, thresholds), which swapped the two
    # rates and therefore mirrored both the reported AUC and the plot axes.
    fpr, tpr, thresholds = roc_curve(Y_test, probas)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % (roc_auc))

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
    if len(sys.argv) != 3:
        logger.error(USAGE)
        sys.exit(0)

    trainFile = sys.argv[1]
    testFile = sys.argv[2]

    sc = SparkContext("local", "SVM: Schizophrenia")

    trainData = sc.textFile(trainFile)
    # sc.parallelize( trainData )
    train = trainData.map(parsePoint)
    train.persist()

    testData = sc.textFile(testFile)
    # sc.parallelize( testData )
    test = testData.map(parsePoint)
    test.persist()

    model = LogisticRegressionWithSGD.train(train)
    labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))
    # accuracy = labelsAndPreds.filter(lambda (v, p): True if p == 1.0 else False ).count() / float(test.count())
    error = labelsAndPreds.filter(
        lambda (v, p): int(v) != int(p)).count() / float(test.count())

    with open("error.txt", "w") as f:
        f.write("Accuracy: {0}\n".format(1 - error))
        f.write("Error: {0}\n".format(error))

    labelsAndPreds.saveAsTextFile(str(time.time()) + ".txt")
Exemplo n.º 38
0
    #Test the accuracy of predicition and print the time taken
    start = timer()
    test_accuracy = trainingLabelAndPreds1.filter(
        lambda (v, p): v == p).count() / float(testData.count())
    end = timer()
    elapsed = end - start
    print '\nPrediction made in: ', elapsed, 'seconds with LBFGS'
    print '\nTest Accuracy is: ', round(test_accuracy, 4)
    trainingError1 = trainingLabelAndPreds1.map(
        lambda (r1, r2): float(r1 != r2)).mean()
    print '\nLBFGS training error: ', trainingError1

elif modelSelection == 'sgd':
    start = timer()
    model2 = LogisticRegressionWithSGD.train(trainingData,
                                             iterations=50,
                                             intercept=True)
    end = timer()
    elapsed = end - start
    globalModel = model2
    print '\nClassifier trained in ', elapsed, ' seconds with SGD'

    # Evaluate the training and test errors
    trainingLabelAndPreds2 = trainingData.map(
        lambda point: (point.label, model2.predict(point.features)))

    #Test the accuracy of predicition and print the time taken
    start = timer()
    test_accuracy = trainingLabelAndPreds2.filter(
        lambda (v, p): v == p).count() / float(testData.count())
    end = timer()
from pyspark.mllib.regression import LabeledPoint
from numpy import array
import parse
# Load and parse the data

#def parsePoint(line):   # Creating vector(array) with first input as y and others as xi's   
#    values = [float(x) for x in line.split(',')]
#    return LabeledPoint(values[10], values[0:9])


sc = SparkContext("local[4]", "Logistic Regression")      # Create the SparkContext (master "local[4]")
data = sc.textFile("/home/ayush/Data /Data for Machine Learning/UCI Adult Data Set/UCI adult.data")  # RDD of raw text lines
parsedData = data.map(parse.parsePoint) # Convert each text line with parse.parsePoint (presumably into a LabeledPoint -- confirm in the parse module)

# Build the model
model = LogisticRegressionWithSGD.train(parsedData)   # Train on the parsed RDD with default parameters
#Use model to create output
#model.predict().collect()    # in "predict" method we have to pass an array
#Read Test data

Testdata = sc.textFile("/home/ayush/Data /Data for Machine Learning/UCI Adult Data Set/UCI adult.test")
parsedTestData = Testdata.map(parse.parsePoint)
#predict result for each Test Data

# Evaluating the model on the held-out test data (parsedTestData), not on the training data

labelsAndPreds = parsedTestData.map(lambda p: (p.label, model.predict(p.features)))  # (true label, predicted label) for every test point
# Wall-clock timestamp in milliseconds.  This map is lazy, so no prediction
# has run yet at this point; the work happens in collect() below.
millis2 = int(round(time.time() * 1000))

print labelsAndPreds.collect()
#Print testing Error
Exemplo n.º 40
0
# 17    3.0   14075  0.042509
# 18    2.0   16520  0.049893
# 19    1.0   32130  0.097037
# 20    0.0  186485  0.563212

#  Genre_Name|label|        raw_Features|
#Create labeledPoints from a Spark DataFrame using Pyspark
# Convert each DataFrame row into a LabeledPoint, using the 'label' column as
# the label and the 'raw_Features' vector (as a dense array) as the features.
training  = training_random.rdd.map(lambda row: LabeledPoint(row['label'], row['raw_Features'].toArray()))
test  = test_random.rdd.map(lambda row: LabeledPoint(row['label'], row['raw_Features'].toArray()))
#label  features

#========LogisticRegressionModel

# Run training algorithm to build the model
# NOTE(review): the label table in the comments above lists label values
# other than 0/1, while validateData=False switches off MLlib's input
# validation.  LogisticRegressionWithSGD looks like a binary classifier --
# confirm whether a multiclass trainer was intended here.
#lr_model = LogisticRegressionWithSGD.train(sc.parallelize(training), validateData=False)
lr_model = LogisticRegressionWithSGD.train(training, validateData=False)
# Pair each prediction (cast to float) with the true label of the same point
predictionAndLabels = test.map(lambda lp: (float(lr_model.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
# The confusion matrix is computed here but the result is discarded.
metrics.confusionMatrix().toArray()

#Overall statistics
print("Recall = %s" % metrics.recall())
print("Precision = %s" % metrics.precision())
print("F1 measure = %s" % metrics.fMeasure())
print("Accuracy = %s" % metrics.accuracy)

# Recall = 0.09641896742635338
# Precision = 0.09641896742635338
Exemplo n.º 41
0
    print(BASE_DATA_PATH)

    conf = (SparkConf().setMaster("local[2]").setAppName("Testing MLLib With DataFrame SQL"))
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # read the dataset
    df_test = sqlContext.read.format("com.databricks.spark.csv").options(delimiter=",").options(header="true").load(
        BASE_DATA_PATH + '/test.csv')

    training = df_test.map(lambda row: LabeledPoint(row.IsClick,
                                                    [float(row.SearchID), float(row.AdID), float(row.Position),
                                                     float(row.HistCTR), float(row.Price)]))

    (trainingData, testData) = training.randomSplit([0.7, 0.3])

    model = LogisticRegressionWithSGD.train(trainingData,iterations = 100,step=0.4)



    # Build the model
    model1 = SVMWithSGD.train(trainingData, iterations=100)




    # Evaluate the model on training data


    model2 = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
Exemplo n.º 42
0
# One-hot-encode every raw validation point via parseOHEPoint and cache the
# result.
OHEValidationData = rawValidationData.map(lambda point: parseOHEPoint(
    point, ctrOHEDict, numCtrOHEFeats))  ##create validation labeled points
OHEValidationData.cache()

# running first model with fixed hyperparameters
numIters = 50            # SGD iterations
stepSize = 10.           # SGD step size
regParam = 1e-6          # regularization strength
regType = 'l2'           # regularization type
includeIntercept = True  # fit an intercept term

print "-------------logistic regression with gradient descent---------"
model0 = LogisticRegressionWithSGD.train(
    data=OHETrainData,
    iterations=numIters,
    step=stepSize,
    regParam=regParam,
    regType=regType,
    intercept=includeIntercept)  ##train model
# Weights in ascending order.
sortedWeights = sorted(model0.weights)
print "------------/logistic regression with gradient descent---------"


def computeLogLoss(p, y):

    epsilon = 10e-12
    if (p == 0):
        p = p + epsilon
    elif (p == 1):
        p = p - epsilon
            .map(lambda lp: len(lp.features.indices))
            .sum())
Test.assertEquals(numNZVal, 372080, 'incorrect number of features')


# CTR prediction and log-loss evaluation, using the MLlib API

from pyspark.mllib.classification import LogisticRegressionWithSGD

# Fixed hyperparameters for the first model.
numIters = 50
stepSize = 10.
regParam = 1e-6
regType = 'l2'
includeIntercept = True

model0 = LogisticRegressionWithSGD.train(OHETrainData,iterations=numIters,step=stepSize,regParam=regParam,regType=regType,intercept=includeIntercept)
# Weights in ascending order; the five smallest are printed and checked below.
sortedWeights = sorted(model0.weights)
print sortedWeights[:5], model0.intercept

# Compare the fitted intercept and the five smallest weights with the
# expected reference values.
Test.assertTrue(np.allclose(model0.intercept,  0.56455084025), 'incorrect value for model0.intercept')
Test.assertTrue(np.allclose(sortedWeights[0:5],
                [-0.45899236853575609, -0.37973707648623956, -0.36996558266753304,
                 -0.36934962879928263, -0.32697945415010637]), 'incorrect value for model0.weights')


# Log loss
from math import log

def computeLogLoss(p, y):
    epsilon = 10e-12
    if y == 1 :
Exemplo n.º 44
0
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD


def parsePoint(line):
    """
    Build an MLlib LabeledPoint from one space-separated line of numbers.

    The first number is the label and the remaining numbers are the
    features.  A label of -1 is remapped to 0 before the point is built.
    """
    numbers = list(map(float, line.split(' ')))
    label = numbers[0]
    features = numbers[1:]
    if label == -1:
        label = 0
    return LabeledPoint(label, features)


# Script entry point: expects exactly two command-line arguments, the input
# file path and the number of training iterations.
if __name__ == "__main__":
    if len(sys.argv) != 3:
        # Wrong argument count: print the usage line to stderr and exit.
        print("Usage: logistic_regression <file> <iterations>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="PythonLR")
    # One LabeledPoint per line of the input file.
    points = sc.textFile(sys.argv[1]).map(parsePoint)
    iterations = int(sys.argv[2])
    model = LogisticRegressionWithSGD.train(points, iterations)
    print("Final weights: " + str(model.weights))
    print("Final intercept: " + str(model.intercept))
    sc.stop()
# One-hot-encode the raw training and validation points via parseOHEPoint
# and cache both RDDs.
OHETrainData = rawTrainData.map(lambda point: parseOHEPoint(point, ctrOHEDict, numCtrOHEFeats)) ##create train labeled points
OHETrainData.cache() ##cache

OHEValidationData = rawValidationData.map(lambda point: parseOHEPoint(point, ctrOHEDict, numCtrOHEFeats)) ##create validation labeled points
OHEValidationData.cache()

# running first model with fixed hyperparameters
numIters = 50            # SGD iterations
stepSize = 10.           # SGD step size
regParam = 1e-6          # regularization strength
regType = 'l2'           # regularization type
includeIntercept = True  # fit an intercept term

print "-------------logistic regression with gradient descent---------"
model0 = LogisticRegressionWithSGD.train(data=OHETrainData, iterations=numIters, step=stepSize,regParam=regParam, regType=regType, intercept=includeIntercept) ##train model
# Weights in ascending order.
sortedWeights = sorted(model0.weights)
print "------------/logistic regression with gradient descent---------"


def computeLogLoss(p, y):
   
    epsilon = 10e-12
    if (p==0):
      p = p + epsilon
    elif (p==1):
      p = p - epsilon
      
    if y == 1:
      z = -log(p)
    elif y == 0:
Exemplo n.º 46
0
# Fraction nrock / (nrock + nxrock); used below as the keep-probability for
# points whose label is not 1.0.
cutoff = float(nrock) / (nrock + nxrock)

# recombine
# NOTE(review): this assignment is overwritten two statements later without
# being read, so the union has no effect on the data used for training.
equalSampleData = labeledRock.union(labeledNotRock)


# Keep every point labelled 1.0; keep each other point with probability
# `cutoff`.
equalSampleData = labeledData.filter(lambda p: random.random() < cutoff if p.label != 1.0 else True)

# split data
# NOTE(review): randomSplit is called as a free function here, not as the
# RDD method -- confirm that a helper of that name is defined elsewhere.
trainData, testData = randomSplit(equalSampleData, [0.9, 0.1])

# Peek at three training points; the result is discarded.
trainData.map(lambda p: (p.label, p.features)).take(3)

# train model
model = LogisticRegressionWithSGD.train(trainData, intercept=False, iterations=10000)
# model = LinearRegressionWithSGD.train(trainData, step = 0.1, iterations=1000)
# model = SVMWithSGD.train(trainData, step=1, iterations=1000, intercept=True)

# evaluate model
# labelsAndPreds = testData.map(lambda p: (p.label, 1 if model.predict(p.features) > 0.5 else 0))
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))

# Share of test points whose prediction equals the label.
accuracy = labelsAndPreds.filter(lambda (v, p): v == p).count() / float(testData.count())

# Precision for class 1: of the points predicted 1, the share that are correct.
guess1 = labelsAndPreds.filter(lambda (v, p): p == 1)
precision1 = guess1.filter(lambda (v, p): v == p).count() / float(guess1.count())

# Recall for class 1: of the points labelled 1, the share predicted correctly.
act1 = labelsAndPreds.filter(lambda (v, p): v == 1)
recall1 = act1.filter(lambda (v, p): v == p).count() / float(act1.count())
Exemplo n.º 47
0
def main():
    appName = "BadOrGood;zl"
    
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores","3")
            .set("spark.executor.instance", "3")
            )
    sc = SparkContext(conf = conf)
    hc = HiveContext(sc)

    #fetch data
    #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    #fetchDataToFile(hc, filepath)
    
    #load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
                    # .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
                    # .repartition(10)
    
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)
    
    
    #standardizer for train and test data
    model = StandardScaler(True, True) \
            .fit( AllDataRawrdd \
                  .map( lambda _: Vectors.dense(_['feature']) ) 
            )
    labels = AllDataRawrdd.map(lambda _: _['label'])
    featureTransformed = model.transform( AllDataRawrdd.map(lambda _: _['feature']) )
    AllDataRawrdd = labels \
                    .zip(featureTransformed) \
                    .map( lambda _: { 'label':_[0], 'feature':_[1] } )
    #sampling
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map( lambda _: LabeledPoint( _['label'], _['feature'] ) ).persist()
    testDatardd = testDataRawrdd.map( lambda _: {'label': _['label'], 'feature': list(_['feature']) } ).persist()
    
    #prediction & test
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)

    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)
  
    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)
  
    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)

    print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac)
    print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac)
    print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac)
    print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac)

    print lrmLBFGS.weights
    print lrmSGD.weights

    sc.stop()
Exemplo n.º 48
0
    label = int(trimmed[-1])
    features = [convert_na_nb(r) for r in trimmed[4:-1]]
    return LabeledPoint(label, Vectors.dense(features))


# Parse every record into a LabeledPoint and count the points.
data = records.map(dealwith)
total_count = data.count()
# For Naive Bayes (records parsed with dealwithNB instead)
nbdata = records.map(dealwithNB)
#print data.first()

numIterations = 10
maxTreeDepth = 5

# Train the logistic regression model
lrModel = LogisticRegressionWithSGD.train(data, numIterations)

# Train the support vector machine model
svmModel = SVMWithSGD.train(data, numIterations)

# Train the Naive Bayes model
nbModel = NaiveBayes.train(nbdata)

# Train the decision tree model
#dtModel=DecisionTree.train(data, Algo.Classification, Entropy, maxDepth)
dtModel = DecisionTree.trainClassifier(data,
                                       numClasses=2,
                                       categoricalFeaturesInfo={},
                                       impurity='entropy',
                                       maxDepth=maxTreeDepth,
                                       maxBins=32)
Exemplo n.º 49
0
    return log_loss


# In[10]:

# try fixed hyperparameters
numIters = 500           # SGD iterations
stepSize = 1             # SGD step size
regParam = 1e-6          # regularization strength
regType = 'l2'           # regularization type
includeIntercept = True  # fit an intercept term

model0 = LogisticRegressionWithSGD.train(rawTrainData,
                                         iterations=numIters,
                                         step=stepSize,
                                         miniBatchFraction=1.0,
                                         initialWeights=None,
                                         regParam=regParam,
                                         regType=regType,
                                         intercept=includeIntercept)
print model0.weights, model0.intercept


# In[11]:

# Sum of the training labels divided by the number of training points.
# NOTE(review): under Python 2 this is a true fraction only if the labels
# are floats -- confirm that rawTrainData holds float labels.
classOneFracTrain = (rawTrainData.map(lambda x: x.label)
                                 .reduce(lambda x, y: x+y))/rawTrainData.count()
print classOneFracTrain

# Baseline: mean log loss over the training labels when every point is
# given the same constant prediction, classOneFracTrain.
logLossTrBase = (rawTrainData.map(lambda x: x.label)
                             .map(lambda x: computeLogLoss(classOneFracTrain, x))
                             .reduce(lambda x, y: x+y))/rawTrainData.count()
Exemplo n.º 50
0
def train_logistic_regression(trainRDD):
    # Fit an SGD logistic regression on trainRDD using 10 iterations and
    # hand back the fitted model.
    fitted_model = LogisticRegressionWithSGD.train(trainRDD, iterations=10)
    return fitted_model
Exemplo n.º 51
0
 def train(self, num_iterations=10):
     """Fit an SGD logistic regression and wrap it with the feature columns.

     Trains on the RDD returned by self._labeled_feature_vector_rdd() for
     num_iterations iterations, then returns a LogisticRegressionModel built
     from the fitted model and self.feature_cols.
     """
     training_rdd = self._labeled_feature_vector_rdd()
     fitted = LogisticRegressionWithSGD.train(training_rdd, num_iterations)
     return LogisticRegressionModel(fitted, self.feature_cols)
    hashValidationData.cache()
    hashTestData = rawTestData.map(lambda x: parseHashPoint(x, numBucketsCTR))
    hashTestData.cache()

    # ===================================================
    # train logistic regression model
    # ===================================================
    numIters = 100
    stepSize = 10.
    regParam = 0.  # no regularization
    regType = 'l2'
    includeIntercept = True

    model = LogisticRegressionWithSGD.train(hashTrainData,
                                            iterations=numIters,
                                            step=stepSize,
                                            regParam=regParam,
                                            regType=regType,
                                            intercept=includeIntercept)
    sortedWeights = sorted(model.weights)

    sys.stderr.write('\n Model Intercept: {0}'.format(model.intercept))
    sys.stderr.write('\n Model Weights (Top 5): {0}\n'.format(
        sortedWeights[:5]))

    l_metrics = []

    l_metrics.append(evaluateMetrics(model, hashTrainData, 'TRAIN'))
    l_metrics.append(evaluateMetrics(model, hashValidationData, 'VALIDATE'))
    l_metrics.append(evaluateMetrics(model, hashTestData, 'TEST'))

    sc.parallelize(l_metrics).saveAsTextFile(sys.argv[4])
Exemplo n.º 53
0
    def create_model(self, data, params):
        """Train an SGD logistic regression on `data`.

        `params` may carry 'numIterations' (converted with int(), default
        10).  Each element of `data` is parsed with self.parsePoint before
        training.  Returns the trained model.
        """
        iteration_count = int(params.get('numIterations', 10))
        labeled_points = data.map(self.parsePoint)
        trained = LogisticRegressionWithSGD.train(labeled_points, iteration_count)
        return trained
Exemplo n.º 54
0
        return None


#set hdfs path
#data = sc.sequenceFile("hdfs://nameservice1/user/geap/warehouse/camus/etl/rat/hourly/2015/06/01/00/*")
data = sc.textFile(
    "hdfs://nameservice1/user/geap/warehouse/geap.db/user_hist_plain/year=2015/*/*/*/*"
)

# Filter the raw lines, parse them into key/value pairs (dropping lines that
# parse to None), concatenate the values per key, and keep the distinct
# values of each key as a list.
parsedData = data.filter(filterPoint).map(parsePoint).filter(
    lambda kv: kv != None).reduceByKey(lambda x, y: x + y).map(
        lambda (k, v): list(set(v)))
parsedData.cache()

#Calculate total number of columns in the dataset
# (the number of distinct values across all lists, and those values sorted)
column_num = parsedData.flatMap(lambda _: _).distinct().count()
column_id = parsedData.flatMap(lambda _: _).distinct().collect()
column_id.sort()

#choose a genre to test
# NOTE(review): the original comment said the default target is the 100th
# column, but the value assigned here is 1 -- confirm which is intended.
genre = 1

sortedData = parsedData.map(sortPoint).filter(lambda p: p != None)

# Pair every sorted record with the chosen genre, label it with labelData,
# and drop records that labelData maps to None.
labeledData = sortedData.map(lambda line: (line, genre)).map(labelData).filter(
    lambda p: p != None)

LRSGDmodel = LogisticRegressionWithSGD.train(labeledData)

print LRSGDmodel.weights
def anom_with_lr():
  try:
    plaintext_rdd = sc.textFile("file:///Users/blahiri/healthcare/data/cloudera_challenge/pat_proc_larger.csv") #69.2 MB
    pat_proc = pycsv.csvToDataFrame(sqlContext, plaintext_rdd, sep = ",")
    anom = pat_proc.filter(pat_proc.is_anomalous == 1)
    benign = pat_proc.filter(pat_proc.is_anomalous == 0)
    n_benign = benign.count()
    
    #Take a random sample of 50K from the unlabeled 100K
    sqlContext.registerFunction("my_random", lambda x: x - x + random())
    sqlContext.registerDataFrameAsTable(benign, "benign")
    benign = sqlContext.sql("SELECT *, my_random(is_anomalous) as random_number FROM benign")
    
    threshold = 50000/n_benign
    into_model = benign.filter(benign.random_number <= threshold)
    for_finding_more = benign.filter(benign.random_number > threshold)
    
    for_modeling = anom.unionAll(into_model.drop(into_model.random_number))
    for_finding_more = for_finding_more.drop(for_finding_more.random_number)
    #Try to pull this from a much larger sample, or, the entire data, because the ones with lowest probabilities, among
    #the selected 10,000, have probabilities around 0.05
    
    print("anom.count() = " + str(anom.count()) + ", benign.count() = " + str(benign.count()) + ", into_model.count() = " + str(into_model.count()) 
            + ", for_modeling.count() = " + str(for_modeling.count()) + ", for_finding_more.count() = " + str(for_finding_more.count()))
    
    all_columns = for_modeling.columns
    features = [x for x in all_columns if (x not in ["patient_id", "is_anomalous"])]
    categorical_features = ["age_group", "gender", "income_range"] #We are listing these 3 as categorical features only as the procedure features have 0-1 values anyway 
    procedure_features = [x for x in features if (x not in categorical_features)]

    #Unlike decision tree, logistic regression does not need the map categoricalFeaturesInfo, just an RDD of LabeledPoint objects.
    
    #Create a dictionary where the key-value pairs are as follows: key is the name of the categorical feature, and value is a list with the following entries:
    #1) an id of the feature that is incremented sequentially, 2) no. of distinct values of the feature, 3) a list of the distinct values of the feature.
    cat_feature_number = 0
    dict_cat_features = {}
    
    for feature in categorical_features:
       agvalues = pat_proc.select(pat_proc[feature].cast("string").alias("feature")).distinct().collect()
       distinct_values = map(lambda row: row.asDict().values()[0], agvalues)
       distinct_values = sorted(map(lambda unicode_val: unicode_val.encode('ascii','ignore'), distinct_values))
       dict_cat_features[feature] = [cat_feature_number, len(distinct_values), distinct_values]
       cat_feature_number += 1
       
    for_modeling = for_modeling.rdd
    print("for_modeling.getNumPartitions() = " + str(for_modeling.getNumPartitions())) #4 partitions: the default should be the number of logical cores, which is 8
    
    (train, test) = for_modeling.randomSplit([0.5, 0.5])
    test_data_size = test.count()
    print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
    training_data = train.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
    print("training_data.count() = " + str(training_data.count()))
    
    t0 = time()
    #model = LogisticRegressionWithLBFGS.train(training_data) #LBFGS took 66.766 seconds
    model = LogisticRegressionWithSGD.train(training_data) #SGCD took 69.261 seconds
    tt = time() - t0
    print "Classifier trained in {} seconds".format(round(tt,3)) 
    
    test_data = test.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
    
    t0 = time()
    predictions = model.predict(test_data.map(lambda p: p.features))
    tt = time() - t0
    print "Prediction made in {} seconds".format(round(tt,3)) #Reports as 0.0 seconds
    
    labelsAndPreds = test_data.map(lambda p: (p.label, model.predict(p.features)))
    test_accuracy = labelsAndPreds.filter(lambda (v, p): v == p).count()/float(test_data_size)

    fpr = labelsAndPreds.filter(lambda (v, p): (v == 0 and p == 1)).count()/labelsAndPreds.filter(lambda (v, p): v == 0).count() 
    fnr = labelsAndPreds.filter(lambda (v, p): (v == 1 and p == 0)).count()/labelsAndPreds.filter(lambda (v, p): v == 1).count()
    print "Test accuracy is {}, fpr is {}, fnr is {}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4)) #Test accuracy is 0.9057, fpr is 0.1634, fnr is 0.0282
    
    model.clearThreshold()
    for_finding_more = for_finding_more.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features)) #OK
    for_finding_more = for_finding_more.map(lambda p: (p.features, model.predict(p.features), p.label)) #OK
    
    try:
      for_finding_more.first() #We perform an action here because otherwise the output will be a PipelinedRDD.
      #Reverse-sort the additional patients by their predicted probabilities of being anomalous and take the top 10,000
      #for_finding_more.take(5)
    except EOFError:
      print("EOF handled")
      
    df = sqlContext.createDataFrame(for_finding_more.collect(), ['features', 'predicted_prob', 'is_anom'])
    df = df.orderBy(df.predicted_prob.desc()) #The orderBy is not actually called if collect() is not called. Can be also triggered by calling take(). We are triggering it by the writing in the next statement.
    df.select('is_anom', 'predicted_prob').limit(10000).write.format('com.databricks.spark.csv').save('file:///Users/blahiri/healthcare/data/cloudera_challenge/additional_10000_from_spark.csv') #Top one has 
    #probability of 0.86818, last one has probability 0.5928958
    
  except Exception:
    print("Exception in user code:")
    traceback.print_exc(file = sys.stdout)
  return for_finding_more
Exemplo n.º 56
0
    pos_file = "data/training_positif_clean.csv"
    neg_file = "data/training_negatif_clean.csv"

    training_idf = training_set(pos_file, neg_file)
    training = training_idf[0]
    idf = training_idf[1]

    test_file = "data/test_clean" + str(part) + ".csv"
    test = test_set(test_file, idf)

    print("\nDone : Tf-IDF training and test sets")

    ###########################################################################
    #########                      Model Training                     #########

    model_regression = LogisticRegressionWithSGD.train(training)
    print("Done : regression training ")

    ###########################################################################
    #########                     Model Testing                       #########

    #regression
    predictions_regression = model_regression.predict(test)
    num_pos_regression = predictions_regression.countByValue()[1.0]
    num_neg_regression = predictions_regression.countByValue()[0.0]

    print("\n== PREDICTION REGRESSION : ==\n")
    print("- Positive : ", num_pos_regression)
    print("- Negative : ", num_neg_regression)

    file.write(
Exemplo n.º 57
0
# 6.	Create LabeledPoint datasets for positive (spam) and negative (ham) examples. A LabeledPoint consists simply of a label and a features vector.

# Spam examples get label 1, ham examples get label 0.
positive_examples = spam_features.map(
    lambda features: LabeledPoint(1, features))
negative_examples = ham_features.map(
    lambda features: LabeledPoint(0, features))

# 7. Create training data and cache it since Logistic Regression is an iterative algorithm. Examine the training data with collect action.

training_data = positive_examples.union(negative_examples)
training_data.cache()
# The collected list is not assigned to anything, so its contents are only
# shown when this is run in an interactive shell.
training_data.collect()

# 8.	Run Logistic Regression using the SGD optimizer and then check the model contents.

model = LogisticRegressionWithSGD.train(training_data)
# Bare expression: shows the model in an interactive shell and does nothing
# when run as a script.
model

# 9.	Test on a positive example (which is a spam) and a negative one (which is a ham). Apply the same HashingTF feature transformation algorithm used on the training data.

pos_example = tf.transform("No investment required".split(" "))
neg_example = tf.transform(
    "Data Science courses recommended for you".split(" "))

# 10.	Now use the learned model to predict spam/ham for new emails.

print "Prediction for positive test: %g" % model.predict(pos_example)
# Prediction for positive test: 1
print "Prediction for negative test: %g" % model.predict(neg_example)
# Prediction for negative test: 0
Exemplo n.º 58
0
# One LabeledPoint per training row.
train_data_formatted = train_data.map(create_labeled_point)

# Get the element-wise minimum of every feature across the training set
train_min_feat = train_data_formatted.map(lambda x: x.features).reduce(
    lambda a, b: np.minimum(a, b))

# Subtract that minimum so every training feature is non-negative (>= 0)
train_data_formatted_pos = train_data_formatted.map(
    lambda x: LabeledPoint(x.label, x.features - train_min_feat))

# Classification using two models
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.classification import LogisticRegressionWithSGD

modelNB = NaiveBayes.train(train_data_formatted_pos)
modelLR = LogisticRegressionWithSGD.train(train_data_formatted_pos)

# Read the test data
test_full = sc.textFile('file:///home/cloudera/bank-additional.csv')
# Drop the header row, strip quotes and line-break characters, and split
# each line on ';'.
test_data_raw = test_full.filter(lambda row: row != train_header)
test_data = test_data_raw.map(lambda line: line.replace('"', '').replace(
    '\n', '').replace('\r', '').split(';'))
# Keep only rows that have exactly 21 fields.
test_data = test_data.filter(lambda vec: len(vec) == 21)

test_data_formatted = test_data.map(create_labeled_point)
# Shift the test features by the *training* minimum, matching the shift
# applied to the training data (test values below that minimum would come
# out negative).
test_data_features = test_data_formatted.map(
    lambda x: x.features - train_min_feat)
test_data_true_label = test_data_formatted.map(lambda x: x.label).collect()

print test_data_true_label[:100]
table1 = sc.textFile("/user/team322/junli_testFeature/*")
def f1(line):
	# Return the first comma-separated field of str(line), after removing
	# every '(' and ')' and replacing every 'None' with '0'.
	cleaned = str(line)
	for old, new in (('(', ''), (')', ''), ('None', '0')):
		cleaned = cleaned.replace(old, new)
	first_field, _, _ = cleaned.partition(',')
	return first_field
user = table1.map(f1).collect() #select the users of validation data
result6 = sc.textFile("/user/team322/junli_trainFeature/*")
# Load and parse the data
def parsePoint(line):
	# Parse one training row into a LabeledPoint.  Parentheses are removed
	# and 'None' becomes '0'; the first two comma-separated fields are
	# skipped, the third is the label and the rest are the features.
	text = str(line)
	for old, new in (('(', ''), (')', ''), ('None', '0')):
		text = text.replace(old, new)
	fields = text.split(',')
	numbers = list(map(float, fields[2:]))
	label = numbers[0]
	features = numbers[1:]
	return LabeledPoint(label, features)
parsedData = result6.map(parsePoint)
# Build the model
model = LogisticRegressionWithSGD.train(parsedData)
result7 = sc.textFile("/user/team322/junli_testFeature/*")
def testParsePoint(line):
    """Turn one stringified test record into a LabeledPoint.

    Parentheses are removed and 'None' becomes '0'; the record is then split
    on ','. The first column is skipped, the second is passed as the label
    and every later column as a feature (all converted with ``float``).
    """
    cleaned = str(line)
    for old, new in (('(', ''), (')', ''), ('None', '0')):
        cleaned = cleaned.replace(old, new)
    # Columns from index 1 onwards: the label slot first, then the features.
    numbers = list(map(float, cleaned.split(',')[1:]))
    label = numbers[0]
    features = numbers[1:]
    return LabeledPoint(label, features)
# Parse the test rows and use the model to predict a class for each one.
parsedData2 = result7.map(testParsePoint)
preds = parsedData2.map(lambda p: model.predict(p.features))
preds = preds.collect()  # bring the predictions to the driver as a list
# Keep the users whose prediction is 1. `user` and `preds` are both produced
# by element-wise maps over the same input path, so they are paired by
# position (assumes both reads return the lines in the same order).
userID = [uid for uid, pred in zip(user, preds) if pred == 1]
# Turn the selected ids back into an RDD and save it as text.
sc.parallelize(userID).saveAsTextFile('/user/team322/solution_v')
t2 = time.ctime()
Exemplo n.º 60
0
    def test_classification(self):
        """Train each classifier on four labelled points and check its predictions.

        Logistic regression, SVM, naive Bayes, a decision tree, a random
        forest and gradient-boosted trees are each trained on the same tiny
        dataset and must predict <= 0 for the points labelled 0.0 and > 0 for
        the points labelled 1.0. The three tree-based models are additionally
        saved to a temporary directory, loaded back, and compared through
        ``toDebugString()``.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest, \
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        def check_predictions(model):
            # Points 0 and 2 are labelled 0.0; points 1 and 3 are labelled 1.0.
            self.assertTrue(model.predict(features[0]) <= 0)
            self.assertTrue(model.predict(features[1]) > 0)
            self.assertTrue(model.predict(features[2]) <= 0)
            self.assertTrue(model.predict(features[3]) > 0)

        def check_save_load(model, model_class, model_dir):
            # A saved-then-loaded model must describe the same trees.
            model.save(self.sc, model_dir)
            same_model = model_class.load(self.sc, model_dir)
            self.assertEqual(same_model.toDebugString(),
                             model.toDebugString())

        temp_dir = tempfile.mkdtemp()
        try:
            lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
            check_predictions(lr_model)

            svm_model = SVMWithSGD.train(rdd, iterations=10)
            check_predictions(svm_model)

            nb_model = NaiveBayes.train(rdd)
            check_predictions(nb_model)

            categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
            dt_model = DecisionTree.trainClassifier(
                rdd,
                numClasses=2,
                categoricalFeaturesInfo=categoricalFeaturesInfo,
                maxBins=4)
            check_predictions(dt_model)
            check_save_load(dt_model, DecisionTreeModel,
                            os.path.join(temp_dir, "dt"))

            rf_model = RandomForest.trainClassifier(
                rdd,
                numClasses=2,
                categoricalFeaturesInfo=categoricalFeaturesInfo,
                numTrees=10,
                maxBins=4,
                seed=1)
            check_predictions(rf_model)
            check_save_load(rf_model, RandomForestModel,
                            os.path.join(temp_dir, "rf"))

            gbt_model = GradientBoostedTrees.trainClassifier(
                rdd,
                categoricalFeaturesInfo=categoricalFeaturesInfo,
                numIterations=4)
            check_predictions(gbt_model)
            check_save_load(gbt_model, GradientBoostedTreesModel,
                            os.path.join(temp_dir, "gbt"))
        finally:
            # Best-effort cleanup that also runs when an assertion above
            # fails, so a failing test does not leave the directory behind.
            try:
                rmtree(temp_dir)
            except OSError:
                pass