예제 #1
0
    def train_single_SVM_model(self, dataset):
        """
        Train a single model using SVM (Support Vector Machine) algorithm.

        The training path is selected by ``self.model_training``:

        * ``"imp"`` - each row becomes a ``UserLabeledPoint`` keyed by the
          value in column 2 and is fed to ``LTRSVMWithSGD``.
        * ``"cmp"`` - ``dataset`` is first joined with ``self.user_clusters``
          on ``self.userId_col``; each row becomes a ``UserLabeledPoint``
          keyed by the last column and is fed to ``LTRSVMWithSGD``.
        * anything else - each row becomes a ``LabeledPoint`` and is fed to
          ``SVMWithSGD``.

        :param dataset: data frame with the training data points
        :return: a SVM model
        """

        Logger.log("train_single_SVM_model")
        if self.model_training == "imp":
            # create User Labeled Points needed for the model
            def createUserLabeledPoint(line):
                # peer_paper_id | paper_id | user_id | features | label
                # userId, label, features
                return UserLabeledPoint(int(line[2]), line[4], line[3])

            # convert data points data frame to RDD
            labeled_data_points = dataset.rdd.map(createUserLabeledPoint)
            Logger.log("Number of partitions for labeled data points: " +
                       str(labeled_data_points.getNumPartitions()))
            # Build the model
            lsvcModel = LTRSVMWithSGD().train(labeled_data_points,
                                              intercept=False,
                                              validateData=False)
        elif self.model_training == "cmp":
            # select only those papers in the training set that are liked by users in the cluster
            cluster_dataset = dataset.join(self.user_clusters, self.userId_col)

            # create User Labeled Points needed for the model
            def createUserLabeledPoint(line):
                # user_id | peer_paper_id | paper_id | features | label | cluster_id |
                # clusterId, label, features
                return UserLabeledPoint(int(line[-1]), line[4], line[3])

            # convert data points data frame to RDD
            labeled_data_points = cluster_dataset.rdd.map(
                createUserLabeledPoint)

            # Build the model
            lsvcModel = LTRSVMWithSGD().train(labeled_data_points,
                                              validateData=False,
                                              intercept=False)
        else:
            # create Label Points needed for the model
            def createLabelPoint(line):
                # label, features
                # paper_id | peer_paper_id | user_id | citeulike_paper_id | features | label
                return LabeledPoint(line[-1], line[-2])

            # convert data points data frame to RDD
            labeled_data_points = dataset.rdd.map(createLabelPoint)
            # Build the model
            lsvcModel = SVMWithSGD().train(labeled_data_points,
                                           validateData=False,
                                           intercept=False)

        # Log completion before the single return; previously this call sat
        # after returns in every branch and could never run.
        Logger.log("Training LTRModel finished.")
        return lsvcModel
예제 #2
0
# Split a sample sentence on spaces and pass the tokens to `tf.transform`.
hamExample = tf.transform("Spark is really good at big data processing".split(" "))

# Print the model's prediction for each example.
# NOTE(review): `model`, `tf` and `spamExample` are not defined in this
# excerpt - presumably created earlier in the script; confirm.
print(model.predict(spamExample))
print(model.predict(hamExample))


from pyspark.mllib.classification import LogisticRegressionWithLBFGS

# Train logistic regression (L-BFGS) on `training_data`, rebinding `algo` and
# `model`, then pass the model to `score`.
# NOTE(review): `score` and `training_data` are defined outside this excerpt.
algo = LogisticRegressionWithLBFGS()
model = algo.train(training_data)
score(model)

#### Support Vector Machines
#### What about SVMs, another popular algorithm?
# Same train-then-score steps as above, this time with SVMWithSGD.
from pyspark.mllib.classification import SVMWithSGD
algo = SVMWithSGD()
model = algo.train(training_data)
score(model)


##### Trees
#####
##### Now let's try three variants of tree-based classification.
##### The API is slightly different from previous algos.
from pyspark.mllib.tree import DecisionTree

from pyspark.mllib.tree import GradientBoostedTrees

from  pyspark.mllib.tree import RandomForest

# Only the DecisionTree instance is created in this excerpt; no training call
# is visible here.
algo = DecisionTree()
예제 #3
0
# Randomly split `samples` with weights 0.8 / 0.2 into training and test sets,
# and mark both for caching.
# NOTE(review): `samples` and `score` are defined outside this excerpt.
[training_data, test_data] = samples.randomSplit([0.8, 0.2])
training_data.cache()
test_data.cache()

# Each section below rebinds `algorithm` and `model`, trains on
# `training_data`, and prints whatever `score(model)` returns.
algorithm = LogisticRegressionWithSGD()
model = algorithm.train(training_data)
print('logistic regression sgd:', score(model))

algorithm = LogisticRegressionWithLBFGS()
model = algorithm.train(training_data)
print('logistic regression with lbfgs:', score(model))

# Tree-based classifiers are currently disabled.
# algorithm = DecisionTree()
# model = algorithm.trainClassifier(training_data, numClasses=2,categoricalFeaturesInfo={})
# print('decision tree: ',score(model))
#
# algorithm = RandomForest()
# model = algorithm.trainClassifier(training_data,numClasses=2,categoricalFeaturesInfo={},numTrees=16)
# print('random forest: ',score(model))

algorithm = NaiveBayes()
model = algorithm.train(training_data)
print('naive bayes: ', score(model))

# The SVM is the only model trained with an explicit iteration count (10).
algorithm = SVMWithSGD()
model = algorithm.train(training_data, iterations=10)
print('svm with sgd: ', score(model))
# Saving the model is currently disabled.
# model.save(sc,"classifierModelPLOS")
# The entry-point guard is a no-op: all statements above run at module level.
if __name__ == "__main__":
    pass
예제 #4
0
# Fit IDF on `tf`, apply it, and collect the TF-IDF result to the driver.
# NOTE(review): `tf` is defined outside this excerpt.
idf = IDF().fit(tf)
tfidf = idf.transform(tf).collect()
#data2 =tfidf.collect()

# a = lists.collect()
# b = tfidf.collect()
# print "type tfidf {} len {}".format(type(b),len(b))
# c=b[0]
# print "type c {} len {}".format(type(c),len(c))

## cross-validation/Grid-search:
# Three shuffled splits over the collected rows, 30% held out, fixed seed.
cv = ShuffleSplit(len(tfidf), n_iter=3, test_size=0.3, random_state=42)

nb = NaiveBayes()
lr = LogisticRegressionWithLBFGS()
svm = SVMWithSGD()
models = [lr, nb, svm]
# One empty score list per model, keyed by class name. The models are
# instances, which have no `__name__`, so the name is read from the type.
scores = {type(model).__name__: [] for model in models}
grids = [ParamGridBuilder()\
.baseOn({lr.labelCol: 'l'})\
.baseOn([lr.predictionCol, 'p'])\
.addGrid(lr.regParam, [1.0, 2.0]) \
.addGrid(lr.maxIter, [0, 1]).build(),\
ParamGridBuilder()\
.addGrid(nb.lambda_, [0, 1]) \
.addGrid(nb.maxIter, [0, 1]).build() ,\
ParamGridBuilder()\
.baseOn({svm.labelCol: 'l'})\
.baseOn([svm.predictionCol, 'p'])\
.addGrid(svm.regParam, [1.0, 2.0]) \
.addGrid(svm.maxIter, [0, 1]).build()
예제 #5
0
### Standardization

# Fit a scaler (mean-centering and scaling both enabled) on the features,
# transform them, and pair each label with its scaled feature vector.
# NOTE(review): `featureRDD`, `labelRDD`, `trainData`, `rawUserRDD` and the
# hyper-parameter names are defined outside this excerpt.
stdscaler = StandardScaler(withMean=True,withStd=True).fit(featureRDD)
scaledFeature = stdscaler.transform(featureRDD)
labelPoint = labelRDD.zip(scaledFeature)
labelPointRDD = labelPoint.map(lambda x:LabeledPoint(x[0],x[1]))

#model
model = LogisticRegressionWithSGD.train(labelPointRDD,num_iter,learning_rate,batch_size)



# svm
from pyspark.mllib.classification import SVMWithSGD

# Training goes through the `train` method; calling the class itself does not
# train a model.
model = SVMWithSGD.train(trainData,num_iter,learning_rate,regParam)

#naiveBayes
from pyspark.mllib.classification import NaiveBayes
model = NaiveBayes.train(trainData,lambdaParam)





#DataFrame

sqlContext = SparkSession.builder.getOrCreate()
from pyspark.sql import Row
# Split each raw line on "|" (Python lambda; the former `x=>...` form is Scala
# syntax and a SyntaxError in Python).
userRDD = rawUserRDD.map(lambda x: x.split("|"))
user_row = userRDD.map(lambda p:Row(userid=int(p[0]),age=int(p[1]),gender=p[2])) # schema
예제 #6
0
    return LabeledPoint(label, Vectors.dense(features))


# Convert every record with `labeled_point_nb` to build the naive Bayes
# training set.
# NOTE(review): `records` and `data` are defined outside this excerpt.
nb_data = records.map(labeled_point_nb)
print("the first data of nb data and the count of nb data:")
print(nb_data.first())
# The message above announces the count too, so print it.
print(nb_data.count())

#start train model
num_iterations = 10
max_tree_depth = 5

lr_model = LogisticRegressionWithLBFGS().train(data, num_iterations)
print("logistic regression model :")
print(lr_model)

svm_model = SVMWithSGD().train(data, num_iterations)
print("svm model :")
print(svm_model)

nb_model = NaiveBayes().train(nb_data)
print("naive bayes model :")
print(nb_model)

# Two classes, no categorical features; pass the configured depth explicitly
# so `max_tree_depth` takes effect.
dt_model = DecisionTree().trainClassifier(data, 2, {}, maxDepth=max_tree_depth)
print("decision tree model :")
print(dt_model)

#start predict
# Predict on the features of the first training point.
data_point = data.first()
lr_prediction = lr_model.predict(data_point.features)
print("logistic model prediction :" + str(lr_prediction))
예제 #7
0
파일: model.py 프로젝트: annelai/ML_pyspark
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD
from pyspark.ml.tuning import ParamGridBuilder

# Model instances keyed by display name.
model_list = {
    "Linear-SVM": SVMWithSGD(),
    "LogisticRegression": LogisticRegressionWithSGD(),
}

# Hyper-parameter grids, using the same keys as `model_list`.
# NOTE(review): the grids read attributes such as `regParam` directly from the
# pyspark.mllib classes; those classes may not expose Param attributes the way
# pyspark.ml estimators do - confirm this evaluates without AttributeError.
params_list = {
    "LogisticRegression": (
        ParamGridBuilder()
        .addGrid(LogisticRegressionWithSGD.regParam, [0.1, 0.01])
        .addGrid(LogisticRegressionWithSGD.step, [0.1, 0.01])
        .addGrid(LogisticRegressionWithSGD.miniBatchFraction, [0.1, 0.5, 1.0])
        .addGrid(LogisticRegressionWithSGD.regType, ['l1', 'l2', None])
        .addGrid(LogisticRegressionWithSGD.convergenceTol, [0.001, 0.0001])
        .build()
    ),
    "Linear-SVM": (
        ParamGridBuilder()
        .addGrid(SVMWithSGD.regParam, [0.1, 0.01])
        .addGrid(SVMWithSGD.step, [0.1, 0.01])
        .addGrid(SVMWithSGD.miniBatchFraction, [0.1, 0.5, 1.0])
        .addGrid(SVMWithSGD.regType, ['l1', 'l2', None])
        .addGrid(SVMWithSGD.convergenceTol, [0.001, 0.0001])
        .build()
    ),
}

if __name__ == "__main__":
    # SVMWithSGD.train(training_data, estimatorParamMaps=grid)
    # SVMWithSGD.predict(test_data)

    # LogisticRegressionWithSGD.train(data, estimatorParamMaps=grid)
    # LogisticRegressionWithSGD.predict(test_data)