Example #1
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Example #2
def svmClassification(trainSetFile,testSetFile):

    data1 = sc.textFile(directory_supervised + trainSetFile)
    trainData = data1.map(parsePoint)
    data2 = sc.textFile(directory_supervised + testSetFile)
    testData = data2.map(parsePoint)

    # Build the model
    model = SVMWithSGD.train(trainData, iterations=10)

    # Evaluating the model on training data
    '''labelsAndPreds = trainData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(trainData.count())
    print("Training Error = " + str(trainErr))
    labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
    testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
    print("Test Error = " + str(testErr))
    return testErr'''
    #labelsAndPreds = testData.map(lambda p: (p.label, float(model.predict(p.features))))
    #truePos = labelsAndPreds.filter(lambda p: p[0] == p[1]).count()
    #print("True pos : " + str(truePos))
    #metrics1 = MulticlassMetrics(labelsAndPreds)
    #print("Recall : " + str(metrics1.recall()))
    #print("Precision : " + str(metrics1.precision()))
    #print(metrics1.confusionMatrix())

    model.clearThreshold()
    scoreAndLabels = testData.map(lambda p: (float(model.predict(p.features)), p.label))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    return metrics.areaUnderROC
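The snippet above relies on a parsePoint helper, a SparkContext named sc, a directory_supervised path and several imports that are not shown. A minimal sketch of that scaffolding, assuming comma-separated lines with the label in the first column (the path and CSV layout are assumptions):

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.evaluation import BinaryClassificationMetrics

sc = SparkContext(appName="svmClassification")
directory_supervised = "data/supervised/"  # hypothetical data location

def parsePoint(line):
    # hypothetical CSV layout: label,feature1,feature2,...
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])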
def modelWithSVM(trainingData, validationData):
	## Train the model using Support Vector Machines with different regularization parameters.
	## Return the SVM model with the best validation accuracy.
	
	#eta = [0.1, 0.3, 0.5, 1.0, 5.0]
	regularizationParamater = [.0000001, 1., 5000., 10000., 200000.]
	bestSVMModel = None
	bestAccuracy = 0
	numOfIterations = 100
	visualizationData = []
	
	
		
	for regularizer in regularizationParamater:

		model = SVMWithSGD.train(trainingData, numOfIterations, 1.0, regParam=regularizer)
		predict = validationData.map(lambda ad: (ad.label, model.predict(ad.features)))
		totalValidationAds = validationData.count()
		correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
		accuracy = float(correctlyPredicted)/totalValidationAds
		
		visualizationData += [(regularizer, accuracy)]
		
		if accuracy > bestAccuracy:
			bestAccuracy = accuracy
			bestSVMModel = model
				
			
				
	return bestSVMModel, visualizationData
def main():
	stock_file = sys.argv[1]
	output_predict_file = sys.argv[2]

	conf = SparkConf().setAppName('Stock Prediction Machine Learning with Twitter')
	sc = SparkContext(conf=conf)
	assert sc.version >= '1.5.1'

	''' extracting the header of CSV file'''
	file_data_all = sc.textFile(stock_file)
	file_header = file_data_all.first()
	file_data = file_data_all.filter(lambda line: line != file_header).cache()

	''' for five different predictions getting data '''
	parsedFileData_NextDayActualOpening = file_data.map(parseNextDayActualOpening)
	parsedFileData_NextDayActualHigh = file_data.map(parseNextDayActualHigh)
	parsedFileData_NextDayActualLow = file_data.map(parseNextDayActualLow)
	parsedFileData_NextDayActualClose = file_data.map(parseNextDayActualClose)
	parsedFileData_NextDayActualVolume = file_data.map(parseNextDayActualVolume)

	print(parsedFileData_NextDayActualOpening.collect())

	''' calling SVM with Stochastic Gradient Descent and
	training using our data set '''
	svm_model_nxtdayactopn = SVMWithSGD.train(parsedFileData_NextDayActualOpening, iterations=10)

	lpreds = parsedFileData_NextDayActualOpening.map(lambda line: (line.label, svm_model_nxtdayactopn.predict(line.features)))

	print(lpreds.collect())
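The parseNextDayActual* helpers are not included in the example. A sketch of what one of them could look like, assuming (purely for illustration) that each CSV row carries a date, today's prices and the next day's actual opening price, and that the label encodes whether the opening price went up; LabeledPoint from pyspark.mllib.regression is assumed to be imported:

def parseNextDayActualOpening(line):
    # hypothetical column layout; adjust the indices to the real CSV schema
    cols = [float(x) for x in line.split(',')[1:]]   # skip a leading date column
    today_open, next_day_open = cols[0], cols[-1]
    label = 1.0 if next_day_open > today_open else 0.0
    return LabeledPoint(label, cols[:-1])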
Example #5
def sendRecord(tup):
    if (not tup.isEmpty()):
        rdd_arr = tup.collect()
        el = rdd_arr[0][1].split('\n')
        tmp_file = get_tmpfile("/home/cloudera/Desktop/test_word2vec.txt")
        model = KeyedVectors.load_word2vec_format(tmp_file)
        vectores = []
        normal_v = []
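        # for each line: the first character is the label, the rest is the text;
        # average the word2vec vectors of its tokens to build a fixed-length feature vector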
        for i in el:
            value = i[:1]
            text = i[2:]
            text_arr = text.split(' ')
            vector = np.zeros(50)  # numpy array so "+=" below adds word vectors element-wise (assumes numpy is imported as np)
            total = 1
            for j in text_arr:
                try:
                    vector += model.get_vector(j)
                    total += 1
                except:
                    pass
            vector = vector / total
            label = LabeledPoint(int(value), vector)
            vectores.append(label)
            normal_v.append([int(value), vector])
        vectores.append(LabeledPoint(0.0, [0] * 50))
        model = SVMWithSGD.train(sc.parallelize(vectores), iterations=100)
        #model.save(sc, "/home/cloudera/pythonSVMWithSGDModel")
        pred = []
        match_0 = 0
        nmatch_0 = 0
        match_1 = 0
        nmatch_1 = 0
        final = []
        for i in normal_v:
            pre = model.predict(i[1])
            if (i[0] == 0):
                if (pre == i[0]):
                    match_0 += 1
                else:
                    nmatch_0 += 1
            else:
                if (pre == i[0]):
                    match_1 += 1
                else:
                    nmatch_1 += 1
            final.append([pre, i[0], i[1]])
        print('\n\n\n\n\n\n\n\n\n\n\n')
        print('\n\n\n\n\n\n\n\n\n\n\n')
        print([[match_0, nmatch_0], [nmatch_1, match_1]])
        print('\n\n\n\n\n\n\n\n\n\n\n')
        print('\n\n\n\n\n\n\n\n\n\n\n')
        print(len(final))
        print('\n\n\n\n\n\n\n\n\n\n\n')
        print('\n\n\n\n\n\n\n\n\n\n\n')
        f = open('/home/cloudera/vectores' + str(uuid.uuid4()) + '.txt', 'w')
        for i in final:
            for j in i:
                f.write('%s' % j)
            f.write('\n')
        f.close()
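sendRecord operates on an RDD (isEmpty, collect), so it is presumably registered on a Spark Streaming DStream; a sketch of the assumed wiring:

stream.foreachRDD(sendRecord)  # stream is a hypothetical DStream of (key, labeled-text) pairs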
def main():
    stock_file = sys.argv[1]
    output_predict_file = sys.argv[2]

    conf = SparkConf().setAppName(
        'Stock Prediction Machine Learning with Twitter')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    ''' extracting the header of CSV file'''
    file_data_all = sc.textFile(stock_file)
    file_header = file_data_all.first()
    file_data = file_data_all.filter(lambda line: line != file_header).cache()
    ''' for five different predictions getting data '''
    parsedFileData_NextDayActualOpening = file_data.map(
        parseNextDayActualOpening)
    parsedFileData_NextDayActualHigh = file_data.map(parseNextDayActualHigh)
    parsedFileData_NextDayActualLow = file_data.map(parseNextDayActualLow)
    parsedFileData_NextDayActualClose = file_data.map(parseNextDayActualClose)
    parsedFileData_NextDayActualVolume = file_data.map(
        parseNextDayActualVolume)

    print(parsedFileData_NextDayActualOpening.collect())
    ''' calling SVM with Stochastic Gradient Descent and
	training using our data set '''
    svm_model_nxtdayactopn = SVMWithSGD.train(
        parsedFileData_NextDayActualOpening, iterations=10)

    lpreds = parsedFileData_NextDayActualOpening.map(lambda line: (
        line.label, svm_model_nxtdayactopn.predict(line.features)))

    print(lpreds.collect())
Example #7
def train(sc, file_positive, files_negative, file_output):
    """
    Trains a binary classification model using positive samples in file_positive and
    negative samples in file_negative. It writes the resulting model to file_output

    :param sc: The spark context
    :type sc: SparkContext
    :param file_positive: The file with positive tweets (relevant ones)
    :type file_positive: str
    :param files_negative: The file with negative tweets (non-relevant ones)
    :type files_negative: list[str]
    :param file_output: The output where to store the trained model
    :type file_output: str
    """
    positive_tweets = sc.textFile(file_positive).map(parse_json).filter(is_valid)
    negative_tweets = [sc.textFile(file_negative).map(parse_json).filter(is_valid) for file_negative in files_negative]
    positive = positive_tweets.map(parse_positive)
    negatives = [nt.map(parse_negative) for nt in negative_tweets]
    data = positive
    for negative in negatives:
        data = data.union(negative)

    try:
        print("Training classification model")
        model = SVMWithSGD.train(data, iterations=150, step=1000.0, regType='l1', regParam=1e-7)
        print("Saving classification model to file")
        pickle.dump(model, open(file_output, 'wb'))
        print("Done!")
    except Exception as e:
        print("Error:")
        print(e)
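Because the model is serialized with pickle rather than with model.save, it can be restored the same way for later scoring; a minimal sketch (file_output is the path used above, and the feature vector is a placeholder):

import pickle

with open(file_output, 'rb') as f:
    model = pickle.load(f)
print(model.predict([0.0, 1.0, 0.5]))  # placeholder feature vector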
Example #8
def train_model(training_data,
                iterations,
                model_file_path,
                calculate_error=True):
    """
    Trains an SVM model and saves it
    :param training_data:
    :param iterations:
    :param model_file_path:
    :return:
    """
    parsed_data = sc.textFile(training_data).map(parse_point)

    # Build the model
    model = SVMWithSGD.train(parsed_data, iterations=iterations)

    # Save the model
    model.save(sc, model_file_path)
    print "Model saved in: ", model_file_path

    if calculate_error:
        #predictions
        labelsAndPreds = parsed_data.map(lambda p:
                                         (p.label, model.predict(p.features)))
        trainErr = labelsAndPreds.filter(
            lambda (v, p): v != p).count() / float(parsed_data.count())
        print("============Training Error = " + str(trainErr))
Example #9
def train_svm(points):

    model = SVMWithSGD.train(points, iterations=200)

    # Save and load model
    #model.save(sc, "target3/tmp/pythonSVMWithSGDModel")
    return model
Example #10
def create_model(config, class1, class2):
    # Load training data
    if len(class1) > 0 and len(class2) > 0:
        train_feature_path = config['root_directory'] + config[
            'feature_directory'] + config['train_one_feature_filename']
    else:
        train_feature_path = config['root_directory'] + config[
            'feature_directory'] + config['train_all_feature_filename']
    data = sc.textFile(train_feature_path)
    parsed_data = data.map(make_labeled_point)

    # Build the model
    model = SVMWithSGD.train(parsed_data, iterations=100)

    # Evaluate the model on training data
    labels_and_preds = parsed_data.map(lambda p:
                                       (p.label, model.predict(p.features)))
    train_err = labels_and_preds.filter(
        lambda lp: lp[0] != lp[1]).count() / float(parsed_data.count())
    print("Training Error = " + str(train_err * 100) + "%")

    if len(class1) > 0 and len(class2) > 0:
        model_path = config['root_directory'] + config[
            'one_vs_one_model_directory']
    else:
        model_path = config['root_directory'] + config[
            'one_vs_all_model_directory']

    if os.path.exists(model_path):
        shutil.rmtree(model_path)

    # Save the model
    model.save(sc, model_path)
Example #11
    def train_single_SVM_model(self, dataset):
        """
        Train a single model using SVM (Support Vector Machine) algorithm.

        :param dataset: paper ids used for training
        :return: a SVM model
        """

        Logger.log("train_single_SVM_model")
        if (self.model_training == "imp"):
            # create User Labeled Points needed for the model
            def createUserLabeledPoint(line):
                # peer_paper_id | paper_id | user_id | features | label
                # userId, label, features
                return UserLabeledPoint(int(line[2]), line[4], line[3])

            # convert data points data frame to RDD
            labeled_data_points = dataset.rdd.map(createUserLabeledPoint)
            Logger.log("Number of partitions for labeled data points: " +
                       str(labeled_data_points.getNumPartitions()))
            # Build the model
            lsvcModel = LTRSVMWithSGD().train(labeled_data_points,
                                              intercept=False,
                                              validateData=False)
            return lsvcModel
        if (self.model_training == "cmp"):
            # select only those papers in the training set that are liked by users in the cluster
            cluster_dataset = dataset.join(self.user_clusters, self.userId_col)

            # create User Labeled Points needed for the model
            def createUserLabeledPoint(line):
                # user_id | peer_paper_id | paper_id | features | label | cluster_id |
                # clusterId, label, features
                return UserLabeledPoint(int(line[-1]), line[4], line[3])

            # convert data points data frame to RDD
            labeled_data_points = cluster_dataset.rdd.map(
                createUserLabeledPoint)

            # Build the model
            lsvcModel = LTRSVMWithSGD().train(labeled_data_points,
                                              validateData=False,
                                              intercept=False)
            return lsvcModel
        else:
            # create Label Points needed for the model
            def createLabelPoint(line):
                # label, features
                # paper_id | peer_paper_id | user_id | citeulike_paper_id | features | label
                return LabeledPoint(line[-1], line[-2])

            # convert data points data frame to RDD
            labeled_data_points = dataset.rdd.map(createLabelPoint)
            # Build the model
            lsvcModel = SVMWithSGD().train(labeled_data_points,
                                           validateData=False,
                                           intercept=False)

            return lsvcModel
        Logger.log("Training LTRModel finished.")
Example #12
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Example #13
def predict_SVMWithSGD(numIterations, step, regParam, regType):
    """
    SVMWithSGD.train(data,iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, regType='l2',intercept=False, validateData=True,convergenceTol=0.001)
    data: the training data, an RDD of LabeledPoint
    iterations: the number of iterations, default 100
    step: the step parameter used in SGD, default 1.0
    regParam: the regularizer parameter, default 0.01
    miniBatchFraction: fraction of data to be used for each SGD iteration, default 1.0
    initialWeights: the initial weights, default None
    regType: the type of regularizer used for training our model, allowed values ('l1':for using L1 regularization; 'l2':for using L2 regularization, default; None: for no regularization)
    intercept: boolean parameter which indicates the use or not of the augmented representation for training data (i.e. whether bias feature are activated or not, default False)
    validateData: boolean parameter which indicates if the algorithm should validate data before training, default True
    convergenceTol: a condition which decides iteration termination, default 0.001
    """
    svmModel = SVMWithSGD.train(scaledData,
                                iterations=numIterations,
                                step=step,
                                regParam=regParam,
                                regType=regType)
    svmMetrics = scaledData.map(lambda p:
                                (svmModel.predict(p.features), p.label))
    svmAccuracy = svmMetrics.filter(
        lambda ap: ap[0] == ap[1]).count() * 1.0 / scaledData.count()
    metrics = BinaryClassificationMetrics(svmMetrics)
    #print "SVMWithSGD model accuracy is: %f in %d iterations,step:%f;regParam:%f;regType:%s" % (svmAccuracy, numIterations,step,regParam,regType)
    return svmAccuracy
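The function reads a module-level RDD of LabeledPoint called scaledData. A sketch of how that RDD could be prepared with StandardScaler and how the function might be swept over a small parameter grid (labeled_data and the grid values are assumptions for illustration):

from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint

label = labeled_data.map(lambda p: p.label)
features = labeled_data.map(lambda p: p.features)
scaler = StandardScaler(withMean=False, withStd=True).fit(features)
scaledData = label.zip(scaler.transform(features)) \
                  .map(lambda lp: LabeledPoint(lp[0], lp[1])).cache()

for reg in [0.01, 0.1, 1.0]:
    print(predict_SVMWithSGD(100, 1.0, reg, 'l2'))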
Example #14
def trainevaluatemodel_svm(traindata,validationdata, iterations, step, minibatchfraction,regparam):
    starttime=time()
    model=SVMWithSGD.train(traindata,iterations=iterations, step=step, regParam=regparam, miniBatchFraction=minibatchfraction, initialWeights=None, regType='l2', intercept=False, validateData=True, convergenceTol=0.001)
    index=evaluation2(model,validationdata)
    duration=time()-starttime
    print('Param:'+'\n'+'iterations:'+str(iterations)+'\n'+'step:'+str(step)+'\n'+'minibatchfraction:'+str(minibatchfraction)+'\n'+'regparam:'+str(regparam)+'\n'+'time:'+str(duration)+'\n'+'index:'+str(index))
    return (iterations, step, minibatchfraction,regparam,duration,index)
Example #15
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Example #16
def modelWithSVM(trainingData, validationData):
    ## Train the model using Support Vector Machines with different regularization parameters.
    ## Return the SVM model with the best validation accuracy.

    #eta = [0.1, 0.3, 0.5, 1.0, 5.0]
    regularizationParamater = [.0000001, 1., 5000., 10000., 200000.]
    bestSVMModel = None
    bestAccuracy = 0
    numOfIterations = 100
    visualizationData = []

    for regularizer in regularizationParamater:

        model = SVMWithSGD.train(trainingData,
                                 numOfIterations,
                                 1.0,
                                 regParam=regularizer)
        predict = validationData.map(lambda ad:
                                     (ad.label, model.predict(ad.features)))
        totalValidationAds = validationData.count()
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds

        visualizationData += [(regularizer, accuracy)]

        if accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestSVMModel = model

    return bestSVMModel, visualizationData
Example #17
def trainEvaluateModel(trainData, validationData, numIterations, stepSize,
                       regParam):
    '''
    Training the model involves passing in different parameter values. (For DecisionTree, the values of parameters such as impurity, maxDepth and maxBins all affect accuracy as well as training time.)
    We display these parameter values, the accuracy and the training time in charts.
    Only one parameter is evaluated at a time, e.g. different values of maxDepth [3, 5, 10, 15, 20, 25]. The steps are:
    (1) train with SVMWithSGD.train, passing in trainData and the different values of a single parameter;
    (2) after building the model, evaluate its AUC on validationData;
    (3) repeat training and evaluation several times, producing the AUC and run time for each parameter value and storing them in metricsRDD;
    (4) once everything has run, convert metricsRDD into a Pandas DataFrame;
    (5) the Pandas DataFrame can then be used to plot AUC and run time, showing how accuracy and execution time vary with each parameter value.
    :param trainData:
    :param validationData:
    :param numIterations:
    :param stepSize:
    :param regParam:
    :return:
    '''
    print('======================= Training and evaluating the model =======================')
    startTime = time()
    model = SVMWithSGD.train(trainData, numIterations, stepSize, regParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print('========== [trainEvaluateModel] >>>> training/evaluating model with parameters: numIterations=' +
          str(numIterations) + ', stepSize=' + str(stepSize) + ', regParam=' +
          str(regParam) + '\n' + '\t\t==>> time taken=' + str(duration) +
          ', AUC=' + str(AUC))
    return (AUC, duration, numIterations, stepSize, regParam, model)
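evaluateModel is not shown in this example; a minimal version consistent with how it is used here (AUC on the validation set), following the BinaryClassificationMetrics pattern from the earlier examples:

from pyspark.mllib.evaluation import BinaryClassificationMetrics

def evaluateModel(model, validationData):
    model.clearThreshold()  # return raw scores instead of 0/1 predictions
    scoreAndLabels = validationData.map(lambda p: (float(model.predict(p.features)), p.label))
    return BinaryClassificationMetrics(scoreAndLabels).areaUnderROC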
def do_1vsall(class_all, size, num_iter, config):
    features_path = config['protocol'] + config['bucket'] + config['sep'] + config['features_key']
    print('do_1vsall ==============> Setting RDD_ALL')
    rdd_all = sc.textFile(features_path, minPartitions=4).map(lambda line: line.split(',')).persist()
    print('do_1vsall ==============> Setting RDD_TRAIN_SET')
    rdd_train_set = rdd_all.filter(lambda features: int(features[1]) <= size) \
        .map(lambda features: ['0.0' if features[0] == class_all else '1.0'] + features[2:]) \
        .map(make_labeled_point)

    print('do_1vsall ==============> Setting RDD_TEST_SET')
    rdd_test_set = rdd_all.filter(lambda features: size < int(features[1])) \
        .map(lambda features: ['0.0' if features[0] == class_all else '1.0'] + features[2:]) \
        .map(make_labeled_point)

    # Build the model
    model_dir = class_all + '_' + str(size) + '_' + str(num_iter)
    model_s3_file = config['model_key'] + config['sep'] + model_dir
    model = None
    if s3_object_exists(config['bucket'], model_s3_file):
        print('do_1vsall ==============> Loading SVM Model: {}...'.format(model_s3_file))
        model = SVMModel.load(sc, config['protocol'] + config['bucket'] + config['sep'] + model_s3_file)
    else:
        print('do_1vsall ==============> Building SVM Model')
        model = SVMWithSGD.train(rdd_train_set, iterations=num_iter)
        print('do_1vsall ==============> Saving SVM Model: {}...'.format(model_s3_file))
        model.save(sc, config['protocol'] + config['bucket'] + config['sep'] + model_s3_file)

    # Evaluate the model on the test data
    print('do_1vsall ==============> Evaluating test set')
    labels_and_preds = rdd_test_set.map(lambda p: (p.label, model.predict(p.features)))
    train_err = labels_and_preds.filter(lambda lp: lp[0] != lp[1]).count() / float(rdd_test_set.count())
    # print("Test Error = " + str(train_err))
    success = round(((1 - train_err) * 100), 2)
    print('{},{}'.format(str(size), str(success)))
    return size, success
Example #19
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Example #20
def model_for_class(cl, dat):
    def adjust_label(lp):
        return LabeledPoint(1 if lp.label == cl else 0, lp.features)

    model = SVMWithSGD.train(dat.map(adjust_label), iterations=10)
    #model.clearThreshold()
    return model
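One possible way to combine these per-class models into a one-vs-rest classifier, assuming the commented-out clearThreshold() is re-enabled so that predict returns comparable raw margins (classes and data are placeholders):

models = {cl: model_for_class(cl, data) for cl in classes}

def predict_one_vs_rest(features):
    # pick the class whose binary model produces the largest margin
    return max(models, key=lambda cl: models[cl].predict(features))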
Example #21
def run_iterations(parsedData, iter, seed):
    fp_rates = []
    tp_rates = []
    # thld_arr   = []
    for i in range(0, 10):
        trainingData, testingData = parsedData.randomSplit([70, 30], seed)
        print("For " + str(iter) + " iterations:")
        # Build the model
        model = SVMWithSGD.train(trainingData, iterations=100)

        # Evaluating the model on training data
        labelsAndPreds = trainingData.map(lambda p:
                                          (p.label, model.predict(p.features)))
        trainErr = labelsAndPreds.filter(
            lambda (v, p): v != p).count() / float(trainingData.count())
        MSE = labelsAndPreds.map(lambda (v, p): (v - p)**2).reduce(
            lambda x, y: x + y) / labelsAndPreds.count()
        print("Training Error = " + str(trainErr))
        print("MSE = " + str(MSE))

        labelsAndPreds = testingData.map(lambda p:
                                         (p.label, model.predict(p.features)))
        testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(
            testingData.count())
        MSE = labelsAndPreds.map(lambda (v, p): (v - p)**2).reduce(
            lambda x, y: x + y) / labelsAndPreds.count()
        print("Testing Error = " + str(testErr))
        print("MSE = " + str(MSE))

        info = labelsAndPreds.collect()
        actual = [int(i[0]) for i in info]
        predictions = [i[1] for i in info]

        false_positive_rate = labelsAndPreds.filter(
            lambda (v, p): v == 1 and p == 0).count() / float(
                labelsAndPreds.filter(lambda (v, p): v == 1).count())
        true_positive_rate = labelsAndPreds.filter(
            lambda (v, p): v == 0 and p == 0).count() / float(
                labelsAndPreds.filter(lambda (v, p): v == 0).count())
        fpr, tpr, thresholds = roc_curve(actual, predictions)
        # roc_auc = auc(false_positive_rate, true_positive_rate)
        print false_positive_rate
        print true_positive_rate
        fp_rates.append(false_positive_rate)
        tp_rates.append(true_positive_rate)

        print fp_rates
        print tp_rates
        roc_auc = auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fp_rates, tp_rates, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    plt.savefig('fig.png')
Example #22
def trainSVMModel(data):
    """
    Train an SVM model and return it
    :param data: RDD[LabeledPoint]
    :return: svm classification model
    """
    from pyspark.mllib.classification import SVMWithSGD, SVMModel
    model = SVMWithSGD.train(data, iterations=100)
    return model
Example #23
def train_level(docs_with_classes, classification, number_of_terms):
    training_vectors = docs_with_classes.map(
        lambda (doc_id, (term_list, classifications)): get_training_vector(
            classification, term_list, classifications, number_of_terms))
    svm = SVMWithSGD.train(training_vectors,
                           iterations=SVM_ITERATIONS,
                           convergenceTol=SVM_CONVERGENCE,
                           regParam=SVM_REG)
    return training_vectors, svm
Example #24
def trainSVMModel(data):
    """
    Train an SVM model and return it
    :param data: RDD[LabeledPoint]
    :return: svm classification model
    """
    from pyspark.mllib.classification import SVMWithSGD, SVMModel
    model = SVMWithSGD.train(data, iterations=100)
    return model
    def process(reviews):
        if (reviews.isEmpty()):
            pass
        else:
            start = time.time()

            #get reviews with overall rating > 3 and overall rating < 3
            pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
            neg_reviews = reviews.filter(lambda x: x[0] < 3.0)

            #set label for each class. 0.0 is positive - 1.0 is negative
            review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

            Words = Row('label', 'words')
            words = reviews.map(lambda r: Words(*r))
            words_df = spark.createDataFrame(words)

            #reviews tokenization
            token = RegexTokenizer(minTokenLength=2,
                                   pattern="[^A-Za-z]+",
                                   inputCol="words",
                                   outputCol="token",
                                   toLowercase=True)
            token_filtered = token.transform(words_df)

            #stopwords elimination
            remover = StopWordsRemover(inputCol="token",
                                       outputCol="stopwords",
                                       caseSensitive=False)
            stopwords_filtered = remover.transform(token_filtered)

            prep_filtered = (
                stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

            #tf-idf calculation
            tf = HashingTF(numFeatures=numFeatures).transform(
                prep_filtered.map(porter_stem, preservesPartitioning=True))
            idf = IDF().fit(tf)
            train_tfidf = idf.transform(tf)

            #set training dataset with label
            training = review_labels.zip(train_tfidf).map(
                lambda x: LabeledPoint(x[0], x[1]))

            #train the model classifier
            model = SVMWithSGD.train(training, iterations=100)
            model_name = "svm" + str(counter_model)
            #save model classifier to HDFS
            output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
            model.save(sc, output_dir)

            counter_model.add(1)

            end = time.time()
            print("Model Name : ", model_name, ", Total Reviews : ",
                  reviews.count(), "Processing Time : ", (end - start))
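process likewise takes an RDD of (rating, review text) pairs, so it is presumably attached to a streaming source; a sketch under that assumption:

reviews_stream.foreachRDD(process)  # reviews_stream is a hypothetical DStream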
def model_per_class(i, labelled_training_data):
    one_against_rest_data = labelled_training_data.map(lambda x: change_label(i, x))
    ones = one_against_rest_data.filter(lambda x: x.label == 1)
    zeros = one_against_rest_data.filter(lambda x: x.label == 0)
    lis = random.sample(range(zeros.count()), ones.count())
    zeros = zeros.zipWithIndex().filter(lambda x: x[1] in lis).map(lambda x: x[0])
    one_against_rest_data = ones.union(zeros)
    model = SVMWithSGD.train(one_against_rest_data, iterations=10000)
    model.clearThreshold()
    return model
def trainEvaluateModel(trainData, validationData, numIterations, stepSize,
                       regParam):
    startTime = time()
    model = SVMWithSGD.train(trainData, numIterations, stepSize, regParam)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    print("训练评估:numIterations->", numIterations, ", stepSize->", stepSize,
          ", regParam->", regParam)
    print("==> 所需时间:", duration, "s ,AUC=", AUC)
    return (AUC, duration, numIterations, stepSize, regParam, model)
Example #28
def svm(trainingData,testData,trainingSize,testSize):
  '''
  linear svm classifier
  '''
  # train a SVM model
  numIterValList = [100,200]
  regParamValList = [0.01,0.1,1,10,100]
  stepSizeValList = [0.1,0.5,1]
  regTypeValList = ['l2','l1']

  # variable for the best parameters
  bestNumIterVal = 200
  bestRegParamVal = 0.01
  bestStepSizeVal = 1
  bestRegTypeVal = 'l2'
  bestTrainErr = 100

  for numIterVal,regParamVal,stepSizeVal,regTypeVal in itertools.product(numIterValList,regParamValList,stepSizeValList,regTypeValList):
    break
    model = SVMWithSGD.train(trainingData, iterations=numIterVal, regParam=regParamVal, step=stepSizeVal, regType=regTypeVal)
    labelsAndPreds = trainingData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(trainingSize)
    if trainErr<bestTrainErr:
      bestNumIterVal = numIterVal
      bestRegParamVal = regParamVal
      bestStepSizeVal = stepSizeVal
      bestRegTypeVal = regTypeVal
      bestTrainErr = trainErr
    print numIterVal,regParamVal,stepSizeVal,regTypeVal,trainErr
  print bestNumIterVal,bestRegParamVal,bestStepSizeVal,bestRegTypeVal,bestTrainErr

  model = SVMWithSGD.train(trainingData, iterations=bestNumIterVal, regParam=bestRegParamVal, step=bestStepSizeVal, regType=bestRegTypeVal)

  # Evaluating the model on training data
  labelsAndPreds = trainingData.map(lambda p: (p.label, model.predict(p.features)))
  trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(trainingSize)
  print trainErr

  # Evaluating the model on test data
  labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
  testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testSize)
  print testErr
  pass
Example #29
def main():
    # prepare training data
    # RDDTrainData = sc.textFile('2007_100.csv')
    RDDTrainData = sc.textFile(','.join([
        # '1987.csv',
        # '1988.csv',
        # '1989.csv',
        # '1990.csv',
        # '1991.csv',
        # '1992.csv',
        # '1993.csv',
        # '1994.csv',
        # '1995.csv',
        # '1996.csv',
        # '1997.csv',
        # '1998.csv',
        # '1999.csv',
        # '2000.csv',
        # '2001.csv',
        # '2002.csv',
        # '2003.csv',
        # '2004.csv',
        # '2005.csv',
        # '2006.csv',
        '2007.csv',
    ]))
    RDDTrainHeader = RDDTrainData.take(1)[0]
    trainData = RDDTrainData.filter(lambda line: line != RDDTrainHeader)\
                            .map(split)\
                            .map(parseTrain)

    # prepare testing data
    RDDTestData = sc.textFile('2008.csv')
    RDDTestHeader = RDDTestData.take(1)[0]
    testData = RDDTestData.filter(lambda line: line != RDDTestHeader)\
                          .map(split)\
                          .map(parseTest)

    # do prediction

    # SVM
    model = SVMWithSGD.train(trainData, iterations=100)

    # Logistic Regression
    # model = LogisticRegressionWithLBFGS.train(trainData)

    predictionData = testData.map(lambda d:
                                  (int(d.label), model.predict(d.features)))

    # evaluate error rate
    errorCount = predictionData.filter(
        lambda d: int(d[0]) != int(d[1])).count()
    totalCount = predictionData.count()
    print 'error rate =', errorCount, '/', totalCount, '=', float(
        errorCount) / float(totalCount)
Example #30
def svm_train(sc, top_path, stopwords_dict=None):
    #   Hook for a stopword dictionary: if a new dictionary is available, put it in this directory
    curpath = os.path.normpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))
    if stopwords_dict is None:
        stopwords = set(
            read_file(os.path.join(curpath, u"stopwords.txt")).split())
    else:
        stopwords = set(
            read_file(os.path.join(curpath, u"stopwords_dict.txt")).split())

    #   For every text in the two class folders: tokenize, remove stopwords, and collect term-frequency counts {'pos': [counter, ...], 'neg': [counter, ...]}

    sub_folder = os.listdir(top_path)
    if len(sub_folder) != 2:
        raise OSError("need and only need two folder")

    top_folder_dict = {}
    for name in sub_folder:
        top_folder_dict[name] = pre_process(os.path.join(top_path, name),
                                            stopwords)

    #   Select the words that discriminate best between the two classes as the feature set
    topk = 500
    features = feature_selection(top_folder_dict[sub_folder[1]],
                                 top_folder_dict[sub_folder[0]], topk)

    #   Compute the IDF for the two classes
    IDF = idf(top_folder_dict[sub_folder[1]], top_folder_dict[sub_folder[1]],
              features)

    #   Vector representation of each text of every class under this binary split [(), (), ...]
    vector1 = {
        '1.0': feature_vector(tf(top_folder_dict[sub_folder[1]], features),
                              IDF)
    }
    vector0 = {
        '0.0': feature_vector(tf(top_folder_dict[sub_folder[0]], features),
                              IDF)
    }

    #   Convert to the input format Spark expects [LabeledPoint(0.0, []), ...]
    labpoint1 = [LabeledPoint(1.0, list) for list in vector1['1.0']]
    labpoint0 = [LabeledPoint(0.0, list) for list in vector0['0.0']]
    train_data = labpoint1 + labpoint0

    classifier = SVMWithSGD.train(sc.parallelize(train_data))

    path = os.path.join(curpath,
                        'svm_' + sub_folder[1] + '_' + sub_folder[0] + '.pkl')
    if os.path.isfile(path): os.remove(path)

    with open(path, 'wb') as output:
        pickle.dump((features, IDF, classifier), output)
Example #31
def SVM_train(data):
    data_train = split_data(data)
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)
    model_SVM = SVMWithSGD.train(training, 10)
    predictionAndlabel = test.map(
        lambda x: (float(model_SVM.predict(x.features)), x.label))
    accuracy = 1.0 * predictionAndlabel.filter(
        lambda (x, v): x == v).count() / test.count()
    print("accuracy of model_SVM:%f" % accuracy)
    return model_SVM, accuracy
Example #32
def gen_predictors(training_data):
    classifiers = dict()
    for item in label_map.iteritems():
        print "Gen predictor for label '{0}' ...".format(item[0])

        global processed_label
        processed_label = item[1]
        svm = SVMWithSGD.train(training_data.map(transform_label))
        classifiers[item[1]] = svm

    return classifiers
Example #33
def main_spark(sc, trainData, testData, outputFilename):
    # Load and process data
    dataProcessing = sc.textFile(trainData) \
        .map(parseTrainData)
    # Load test data
    testDataLoad = sc.textFile(testData)\
        .map(parseTestData)

    # Build svm model
    #model = SVMWithSGD.train(dataProcessing, iterations=100, step=1.0, regParam=0.01)
    model = SVMWithSGD.train(dataProcessing, iterations=50, step=1.0, regParam=0.01, miniBatchFraction=20.0)
Example #34
def main():
    # prepare training data
    # RDDTrainData = sc.textFile('2007_100.csv')
    RDDTrainData = sc.textFile(','.join([
        # '1987.csv',
        # '1988.csv',
        # '1989.csv',
        # '1990.csv',
        # '1991.csv',
        # '1992.csv',
        # '1993.csv',
        # '1994.csv',
        # '1995.csv',
        # '1996.csv',
        # '1997.csv',
        # '1998.csv',
        # '1999.csv',
        # '2000.csv',
        # '2001.csv',
        # '2002.csv',
        # '2003.csv',
        # '2004.csv',
        # '2005.csv',
        # '2006.csv',
        '2007.csv',
    ]))
    RDDTrainHeader = RDDTrainData.take(1)[0]
    trainData = RDDTrainData.filter(lambda line: line != RDDTrainHeader)\
                            .map(split)\
                            .map(parseTrain)

    # prepare testing data
    RDDTestData = sc.textFile('2008.csv')
    RDDTestHeader = RDDTestData.take(1)[0]
    testData = RDDTestData.filter(lambda line: line != RDDTestHeader)\
                          .map(split)\
                          .map(parseTest)

    # do prediction

    # SVM
    model = SVMWithSGD.train(trainData, iterations=100)

    # Logistic Regression
    # model = LogisticRegressionWithLBFGS.train(trainData)

    predictionData = testData.map(lambda d:
        (int(d.label), model.predict(d.features))
    )

    # evaluate error rate
    errorCount = predictionData.filter(lambda d: int(d[0]) != int(d[1])).count()
    totalCount = predictionData.count()
    print 'error rate =', errorCount, '/', totalCount, '=', float(errorCount) / float(totalCount)
Example #35
def train_evaluate_model(train_data, valid_data, iterations, step, regParam):
    start_time = time()
    # train
    model = SVMWithSGD.train(
        train_data, iterations=iterations, step=step, regParam=regParam)
    # evaluate
    # y_pred y_true
    AUC = evaluate_model(model, valid_data)
    duration = time() - start_time
    print(f"训练评估:使用参数 step={step}, iterations={iterations}, regParam={regParam} ==>所需时间={duration} 结果AUC = {AUC}")
    return AUC, duration, iterations, step, regParam, model
Example #36
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example #37
def mySVM(training, test):
    # SVM
    training_svc = training.map(lambda x: LabeledPoint(x[29], x[1:28]))
    sv = SVMWithSGD.train(training_svc,
                          iterations=100,
                          step=0.1,
                          regParam=0.01)
    test_svc = test.map(lambda x: LabeledPoint(x[29], x[1:28]))
    predictions = test_svc.map(lambda x:
                               (x.label, float(sv.predict(x.features))))
    return predictions
Example #38
def SVM_module(training):
  """This function returns a SVM model from your training data.

  :param training: (REQUIRED) - the training data
  :return: SVM model

  Use it as (Be sure to call split_data() to get the training data):

  >>> model = SVM_module(trainingData)
  """
  # Train a SVM model
  return SVMWithSGD.train(training, iterations=300)
Example #39
def run_iterations(parsedData, iter, seed):
    fp_rates = []
    tp_rates = []
    # thld_arr   = []
    for i in range(0, 10):
        trainingData, testingData = parsedData.randomSplit([70, 30], seed)
        print("For " + str(iter) + " iterations:")
        # Build the model
        model = SVMWithSGD.train(trainingData, iterations=100)

        # Evaluating the model on training data
        labelsAndPreds = trainingData.map(lambda p: (p.label, model.predict(p.features)))
        trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(trainingData.count())
        MSE = labelsAndPreds.map(lambda(v,p): (v-p)**2).reduce(lambda x, y: x + y)/labelsAndPreds.count()
        print("Training Error = " + str(trainErr))
        print("MSE = " + str(MSE))

        labelsAndPreds = testingData.map(lambda p: (p.label, model.predict(p.features)))
        testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testingData.count())
        MSE = labelsAndPreds.map(lambda(v,p): (v-p)**2).reduce(lambda x, y: x + y)/labelsAndPreds.count()
        print("Testing Error = " + str(testErr))
        print("MSE = " + str(MSE))



        info = labelsAndPreds.collect()
        actual = [int(i[0]) for i in info]
        predictions = [i[1] for i in info]

        false_positive_rate = labelsAndPreds.filter(lambda (v, p): v == 1 and p == 0).count() / float(labelsAndPreds.filter(lambda (v, p): v == 1).count())
        true_positive_rate = labelsAndPreds.filter(lambda (v, p): v == 0 and p == 0).count() / float(labelsAndPreds.filter(lambda (v, p): v == 0).count())
        fpr, tpr, thresholds = roc_curve(actual, predictions)
        # roc_auc = auc(false_positive_rate, true_positive_rate)
        print false_positive_rate
        print true_positive_rate
        fp_rates.append(false_positive_rate)
        tp_rates.append(true_positive_rate)


        print fp_rates
        print tp_rates
        roc_auc = auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fp_rates, tp_rates, 'b',
    label='AUC = %0.2f'% roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([-0.1,1.2])
    plt.ylim([-0.1,1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    plt.savefig('fig.png')
Example #40
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example #41
def main(sc):
    train_data='/usr/local/spark/data/mllib/sample_svm_data.txt'
    data=sc.textFile(train_data).map(parse)
    
    if os.path.exists('model'):
        model=SVMModel.load(sc, 'model')
    else:
        model=SVMWithSGD.train(data, iterations=100)
        model.save(sc, 'model')

    labelsAndPreds=data.map(lambda p: (p.label, model.predict(p.features)))

    # trainErr=labelsAndPreds.filter(lambda (v, p): v != p).count() / float(data.count())
    # print('Training Error ='  + str(trainErr))

    labelsAndPreds.map(lambda x:str(x[0])+'\t'+str(x[1])).saveAsTextFile('labelsAndPreds')
Example #42
def training(path):
	#import dataset into RDD
	raw_data = sc.textFile(path)
	#parse raw data into label bag-of-words pairs
	parsed_data = raw_data.map(lambda line: parse_line(line))
	#separate into training set and test set
	training_set, test_set = parsed_data.randomSplit([0.6, 0.4], 17)
	#get features for model training
	features = feature_extraction(training_set)
	labeled_points_training = training_set.map(lambda line: construct_labeled_point(line, features))
	labeled_points_test = test_set.map(lambda line: construct_labeled_point(line, features))
	#train logistic regression model
	lrModel = LogisticRegressionWithLBFGS.train(labeled_points_training)
	#train naive bayes model
	nbModel = NaiveBayes.train(labeled_points_training)
	svmModel = SVMWithSGD.train(labeled_points_training)
	return lrModel, nbModel, svmModel, labeled_points_test, features
    def train_trend_model(self, model, data, i):
        self.logger.info('Start to train the direction model')
        rdd_data = self.sc.parallelize(data)
        if self.trend_prediction_method == self.RANDOM_FOREST:
            model = RandomForest.trainClassifier(rdd_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=40,
                                                 featureSubsetStrategy="auto", impurity='gini', maxDepth=20,
                                                 maxBins=32)
        elif self.trend_prediction_method == self.NAIVE_BAYES:
            model = NaiveBayes.train(rdd_data)

        elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
            model = LogisticRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                    initialWeights=None if model is None else model.weights)

        elif self.trend_prediction_method == self.SVM:
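            # warm start: reuse the previous model's weights as initialWeights so each call refines the existing classifier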
            model = SVMWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                     initialWeights=None if model is None else model.weights)

        return model
Example #44
def main(sc):
    inputFile=sys.argv[1]
    modelPath=sys.argv[2]
    
    data = sc.textFile(inputFile)
    parsedData = data.map(parsePoint)

    # Build the model
    model = SVMWithSGD.train(parsedData, iterations=100)

    # Evaluating the model on training data
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, modelPath)
    # sameModel = SVMModel.load(sc, "svm_model")
    sc.stop()
Example #45
def svm_train(sc, top_path, stopwords_dict=None):
    #   Hook for a stopword dictionary: if a new dictionary is available, put it in this directory
    curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    if stopwords_dict is None:
        stopwords = set(read_file(os.path.join(curpath, u"stopwords.txt")).split())
    else:
        stopwords = set(read_file(os.path.join(curpath, u"stopwords_dict.txt")).split())

    #   For every text in the two class folders: tokenize, remove stopwords, and collect term-frequency counts {'pos': [counter, ...], 'neg': [counter, ...]}

    sub_folder = os.listdir(top_path)
    if len(sub_folder) != 2:
        raise OSError("need and only need two folder")

    top_folder_dict = {}
    for name in sub_folder:
        top_folder_dict[name] = pre_process(os.path.join(top_path, name), stopwords)

    #   Select the words that discriminate best between the two classes as the feature set
    topk = 500
    features = feature_selection(top_folder_dict[sub_folder[1]], top_folder_dict[sub_folder[0]], topk)

    #   Compute the IDF for the two classes
    IDF = idf(top_folder_dict[sub_folder[1]], top_folder_dict[sub_folder[1]], features)

    #   Vector representation of each text of every class under this binary split [(), (), ...]
    vector1 = {'1.0': feature_vector(tf(top_folder_dict[sub_folder[1]], features), IDF)}
    vector0 = {'0.0': feature_vector(tf(top_folder_dict[sub_folder[0]], features), IDF)}

    #   Convert to the input format Spark expects [LabeledPoint(0.0, []), ...]
    labpoint1 = [LabeledPoint(1.0, list) for list in vector1['1.0']]
    labpoint0 = [LabeledPoint(0.0, list) for list in vector0['0.0']]
    train_data = labpoint1 + labpoint0

    classifier = SVMWithSGD.train(sc.parallelize(train_data))

    path = os.path.join(curpath, 'svm_' + sub_folder[1] + '_' + sub_folder[0] + '.pkl')
    if os.path.isfile(path): os.remove(path)

    with open(path, 'wb') as output:
        pickle.dump((features, IDF, classifier), output)
Example #46
def train_model(training_data, iterations, model_file_path, calculate_error=True):
    """
    Trains an SVM model and saves it
    :param training_data:
    :param iterations:
    :param model_file_path:
    :return:
    """
    parsed_data = sc.textFile(training_data).map(parse_point)

    # Build the model
    model = SVMWithSGD.train(parsed_data, iterations=iterations)

    # Save the model
    model.save(sc, model_file_path)
    print "Model saved in: ", model_file_path

    if calculate_error:
        #predictions
        labelsAndPreds = parsed_data.map(lambda p: (p.label, model.predict(p.features)))
        trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsed_data.count())
        print("============Training Error = " + str(trainErr))
Example #47
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
                                                categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)
Example #48
0
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
            maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

        try:
            rmtree(temp_dir)
        except OSError:
            pass
Example #49
0
    def train(self, num_iterations=10):
        # TODO support all the keyword training params
        model = SVMWithSGD.train(self._labeled_feature_vector_rdd(), num_iterations)
        return SVMModel(model, self.feature_cols)
Example #50
0
import sys

def toLabeledPoints(sc, data):
    return sc.parallelize(data).map(lambda x: LabeledPoint(x[0], x[1]))

def loadData(path):
    data_file = open(path,"r")
    return pickle.load(data_file)

def computeError(m, d):
    labelsAndPreds = d.map(lambda p: (p.label, m.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v,p): v != p).count() / float(d.count())
    return trainErr

if __name__ == "__main__":
    conf = SparkConf().setAppName("SpamFilter").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    data = toLabeledPoints(sc, loadData(sys.argv[1]))
    testData = toLabeledPoints(sc, loadData(sys.argv[2]))

    # Train the model with different regularization parameters
    results = []
    for i in [0.00, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]:
        model = SVMWithSGD.train(data, step=0.05, regParam=i)
        results.append((i, computeError(model, testData)))

    outfile = open("results.txt","w")
    outfile.write(str(results))
    outfile.close()
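    # Hedged follow-up sketch: report the regularization parameter with the lowest
    # error on the held-out test set from the sweep above.
    best_reg, best_err = min(results, key=lambda r: r[1])
    print("Best regParam: %s (test error: %s)" % (best_reg, best_err))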
Example #51
0
Err = 0.0
results = []
for train_index, test_index in ss:
	X_training, Y_training, X_test, Y_test = [], [], [], []
	for i in train_index:
		X_training.append(X[i])
		Y_training.append(Y[i])
	for i in test_index:
		X_test.append(X[i])
		Y_test.append(Y[i])
		
	parsedData = []
	for i in range(0, len(X_training)):
		parsedData.append(LabeledPoint(Y_training[i], X_training[i]))
		
	model = SVMWithSGD.train(sc.parallelize(parsedData))
		
	testErr = 0
	for i in range(0, len(X_test)):
		a = Y_test[i]
		b = model.predict(X_test[i])
		if a != b:
			testErr += 1
		
	Err += float(testErr) / float(len(X_test))

	 
print ("AVG test error: %.6f" % 
	(Err/iter_number))
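The variables ss, X, Y, and iter_number are assumed to be defined before this snippet. One plausible construction of ss, assuming scikit-learn's ShuffleSplit over the indices of X, would be:

from sklearn.model_selection import ShuffleSplit

iter_number = 10
# yields (train_index, test_index) pairs, matching the loop above
ss = ShuffleSplit(n_splits=iter_number, test_size=0.2, random_state=0).split(X)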

Example #52
0
def svmTest(sqlContext, dataset_rdd, positive_negotive_rate):
    dataset_positive = dataset_rdd.filter(lambda e: e[1] > 0.5)
    dataset_negotive = dataset_rdd.filter(lambda e: e[1] < 0.5)
    train_positive = dataset_positive.sample(False, 0.8)
    test_positive = dataset_positive.subtract(train_positive)
    train_negotive = dataset_negotive.sample(False, 0.8)
    test_negotive = dataset_negotive.subtract(train_negotive)
    trainset_rdd = train_positive.union(train_negotive)
    testset_rdd = test_positive.union(test_negotive)
    trainset = trainset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
    trainset_nums = trainset.count()
    testset = testset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
    testset_nums = testset.count()
    trainset_positive = train_positive.count()
    testset_positive = test_positive.count()
    model = SVMWithSGD.train(trainset, iterations=100)
    predict = testset.map(lambda p: (p.label, model.predict(p.features)))
    hitALL = predict.filter(lambda e: e[0] == e[1]).count()
    hitPositive = predict.filter(lambda e: e[0] == e[1] and (e[0] > 0.5)).count()
    positive = predict.filter(lambda e: e[1] > 0.5).count()
    recallPositive = hitPositive / float(testset_positive)
    precision = hitPositive / float(positive)
    accuracy = hitALL / float(testset.count())
    F_Value = 2 / (1 / precision + 1 / recallPositive)
    return (trainset_nums, testset_nums, trainset_positive, testset_positive, positive, hitPositive, precision, recallPositive, accuracy, F_Value, model)
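A hedged usage sketch for svmTest (sqlContext and dataset_rdd are placeholders for whatever the caller prepares; the second column of each row is assumed to be the 0/1 label, as the function expects):

(ntrain, ntest, ntrain_pos, ntest_pos, pred_pos, hit_pos,
 precision, recall, accuracy, f_value, model) = svmTest(sqlContext, dataset_rdd, 0.8)
print('precision=%.3f recall=%.3f accuracy=%.3f F=%.3f' % (precision, recall, accuracy, f_value))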
def processData(sqlContext):
    dataset_label_gender = sqlContext.parquetFile('/hadoop/hadoop_/wxq/test0405/labels_gender*')
    imeis_ads = sqlContext.parquetFile('/hadoop/hadoop_/wxq/test0405/imeis_ads*')
    imeis_aboutTimes = sqlContext.parquetFile('/hadoop/hadoop_/wxq/test0405/imeis_about*')
    imeis_apps = sqlContext.parquetFile('/hadoop/hadoop_/wxq/test0405/imeis_apps*')
    imeis_prvs = sqlContext.parquetFile('/hadoop/hadoop_/wxq/test0405/imeis_prvs*')
Example #53
0
train_dict = [i.asDict() for i in feats_train]

feats_test = test.collect()
test_dict = [i.asDict() for i in feats_test]

def parsePoint(d):
    d_copy = deepcopy(d) # I hate using deepcopy so much
    pred = d_copy['success_class']
    d.pop('success_class', None)
    values = [float(x) for x in d.values()]
    return LabeledPoint(pred, map(float,values))

trainParsed = sc.parallelize(map(parsePoint, train_dict))
testParsed = sc.parallelize(map(parsePoint, test_dict))

model = SVMWithSGD.train(trainParsed, iterations=100)

# Training Error
trainLabelsAndPreds = trainParsed.map(lambda p: (p.label, float(model.predict(p.features))))
trainErr = trainLabelsAndPreds.filter(lambda (v, p): v != p).count()/float(trainParsed.count())
print trainErr

# Test Error
testLabelsAndPreds = testParsed.map(lambda p: (p.label, float(model.predict(p.features))))
testErr = testLabelsAndPreds.filter(lambda (v, p): v != p).count()/float(testParsed.count())
print testErr

metrics = BinaryClassificationMetrics(testLabelsAndPreds)

print metrics.areaUnderROC
print metrics.areaUnderPR
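The AUC above is computed from hard 0/1 predictions; a hedged refinement, as some of the other examples here do, is to clear the threshold and score with raw SVM margins:

model.clearThreshold()
scoreAndLabels = testParsed.map(lambda p: (float(model.predict(p.features)), p.label))
print BinaryClassificationMetrics(scoreAndLabels).areaUnderROC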
Example #54
0
	values = [float(x) for x in clean_line_split]
	if values[7] == 0:
		values[7] = 1
	else:
		values[7] = 0

	return LabeledPoint(values[7], values[0:7])  # dep_delay, cancelled, diverted, carrier delay, weather delay, NAS delay, security delay, late aircraft delay

#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])
startTime = datetime.now()

# Build the model
trainingData.cache()
model = SVMWithSGD.train(trainingData, iterations=1)
print ('Training Time consumed = '), (datetime.now() - startTime)
startTestTime = datetime.now()

# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
print ('Testing Time consumed = '), (datetime.now() - startTestTime)
print ('Time consumed = '), (datetime.now() - startTime)

print("Training Error = " + str(testErr))


# Save and load model
model.save(sc, "SVMwide00-08train")
sameModel = SVMModel.load(sc, "SVMwide00-08train")
Example #55
0
NB_percent = []
SVM_percent = []
LRSGD_percent = []
LRLBFGS_percent = []

for i in topFeatures:
	parsedData = sortedData.map(lambda line : (line, i)).map(labelData)
	splits = parsedData.randomSplit((0.9, 0.1))
	train_set = splits[0]
	train_set.cache()
	test_set = splits[1]
	test_set.cache()
	#NBmodel = NaiveBayes.train(train_set)
	#NB_socredLabel = numpy.array(test_set.map(lambda lp: (NBmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	#findCoveragePercent(NB_socredLabel, 0.4)
	SVMSGDmodel = SVMWithSGD.train(train_set)
	SVMSGDmodel.clearThreshold()
	SVM_scoredLabel = numpy.array(test_set.map(lambda lp: (SVMSGDmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	
	SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.4))
	SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.8))
	SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 1.0))
	LRSGDmodel = LogisticRegressionWithSGD.train(train_set)	
	LRSGDmodel.clearThreshold()
	LRSGD_scoedLabel = numpy.array(test_set.map(lambda lp: (LRSGDmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 0.4))
	LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 0.8))
	LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 1.0))
	LRLBFGSmodel = LogisticRegressionWithLBFGS.train(train_set)
	LRLBFGSmodel.clearThreshold()
	LRLBFGS_scoredLabel = numpy.array(test_set.map(lambda lp: (LRLBFGSmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
Example #56
0
# the number of features is the columns of the matrix
#we need this information to convert to vectors and label point the coordinate data
cols=sc.broadcast(len(m.get_feature_names())) 
print "number of features"+str(cols.value)

#convert to labeled point in parallel
tmpLB=tmp.map(partial(toLB,cols=cols,class_v=bY)) 

print "splitting the data"
train, test = tmpLB.randomSplit([0.6, 0.4], seed = 0)
print "training the machine learning algorithm"
#Change ---------------------------------
#model = NaiveBayes.train(train, 1.0)

### Change DONE
model=SVMWithSGD.train(train, 1.0) 
### Change XCA
# TODO: we are testing several ML algorithms
# 1) Logistic regression
#model = LogisticRegressionWithSGD.train(train)   # used for logistic regression classification

# 2) SVM classification
#model = SVMWithSGD.train(train)   # used for SVM classification

# 3) Random forest
#************ The random forest model in pyspark is experimental, so it may not work perfectly
#model = RandomForest.trainClassifier(train, 2, {}, 300, seed=2)   # 300 trees is reported as the best setting for this dataset in the literature

print "retrieving predictions and evaluating"
predictionAndLabel = test.map(lambda p : (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
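A hedged sketch of the model-comparison idea from the comments above, reusing the same train/test RDDs and reporting accuracy for each candidate:

from pyspark.mllib.classification import NaiveBayes, SVMWithSGD, LogisticRegressionWithSGD

def evaluate(m):
    # fraction of correctly classified test points
    pairs = test.map(lambda p: (m.predict(p.features), p.label))
    return 1.0 * pairs.filter(lambda (x, v): x == v).count() / test.count()

for name, m in [("naive_bayes", NaiveBayes.train(train, 1.0)),
                ("svm", SVMWithSGD.train(train)),
                ("logistic_sgd", LogisticRegressionWithSGD.train(train))]:
    print name, evaluate(m)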
Example #57
0
# Build the model
logitmodel = LogisticRegressionWithLBFGS.train(parsedData)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, logitmodel.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))  ## 0.353992330848

############################ SVM ##############################

from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

# Build the model
SVMmodel = SVMWithSGD.train(parsedData, iterations=100)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, SVMmodel.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr)) ## 0.555395278766

############################ Decision TREE ##############################

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils


def parsePoint(line):
    values = [float(x) for x in line.split(',')]
Example #58
0
rows_num = float(sorted_labelled.count())


precisions = []
recalls = []
recallNum = []
sum = 0.0
model_start = time.time()

for label in random_labels:
	parsedData = sorted_labelled.map(lambda line : (line, label)).map(labelData)
	splits = parsedData.randomSplit((0.9, 0.1))
	train_set = splits[0]
	test_set = splits[1]
	test_set.cache()
	model = SVMWithSGD.train(train_set)
	#model = LogisticRegressionWithSGD.train(train_set)
	#model = LogisticRegressionWithLBFGS.train(train_set)
	#model = DecisionTree.trainClassifier(train_set, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=32)
	#model = RandomForest.trainClassifier(train_set, numClasses=2, categoricalFeaturesInfo={}, numTrees=5, featureSubsetStrategy="auto", impurity='gini', maxDepth=3, maxBins=32)
	#labelsAndPreds = test_set.map(lambda p: (p.label, model.predict(p.features)))
	#testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(test_set.count())
	predictions = model.predict(test_set.map(lambda x: x.features))
	labelsAndPredictions = test_set.map(lambda lp: lp.label).zip(predictions)
	labelsAndPredictions.cache()
	precision = labelsAndPredictions.filter(lambda (v, p): v == p).count() / float(test_set.count())
	if labelsAndPredictions.filter(lambda (v, p): v == 1).count() != 0:
		recall = labelsAndPredictions.filter(lambda (v, p): v == p and v == 1).count() / float(labelsAndPredictions.filter(lambda (v, p): v == 1).count())
		recallNum.append(labelsAndPredictions.filter(lambda (v, p): v == 1).count())
	else:
		recall = 1.0
Example #59
0
def get_labeled_point(line):
    items = line.strip().split()
    y = items[0]
    x = items[1:]
    # return LabeledPoint(y, x)
## this explicitly maps each example to a higher dimensional space
## namely the space of a degree 2 polynomial kernel
    poly = npp.Polynomial([float(_) for _ in x])
    return LabeledPoint(y, (poly*poly).coef)
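Note that squaring the Polynomial above sums the products x_i * x_j that share the same i + j. A hedged alternative sketch of the full explicit degree-2 feature map, which keeps every pairwise product separately, would be:

import numpy as np

def degree2_features(x):
    # all pairwise products x_i * x_j (upper triangle, including squares), plus the original features
    x = np.asarray(x, dtype=float)
    pairwise = np.outer(x, x)[np.triu_indices(len(x))]
    return np.concatenate([x, pairwise])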

## load data and prep for SVM
data = sc.textFile("all_hands.txt")
examples = sc.parallelize(data.map(get_labeled_point).collect())

results = {}

## train SVMs with different regularization parameters
for exponent in range(5,11,2):
    model = SVMWithSGD.train(examples,
                             iterations=50,
                             regParam=2 ** exponent,
                             miniBatchFraction=1,
                             step=1)

## compute training error for that regParam
    incorrect_predictions = examples.map(lambda p: p.label != model.predict(p.features))
    training_error = incorrect_predictions.filter(lambda p : p).count() / float(examples.count())
    print "Training Error: %s" % training_error

    results[2**exponent] = training_error

print results
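Training error alone cannot select regParam; a hedged follow-up sketch is to hold out part of the data and score that split instead, reusing the same examples RDD:

## hold out 20% of the examples for validation
train_part, valid_part = examples.randomSplit([0.8, 0.2], seed=17)
model = SVMWithSGD.train(train_part, iterations=50, regParam=2 ** 7, miniBatchFraction=1, step=1)
wrong = valid_part.map(lambda p: p.label != model.predict(p.features)).filter(lambda w: w).count()
print "Validation error: %s" % (wrong / float(valid_part.count()))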