Example #1
train_binary_classification = pd.concat(
    [train["id"], train["comment_text"], train["clean"]], axis=1)

tokenizer = Tokenizer().setInputCol("comment_text").setOutputCol("words")
# Remove stopwords
remover = StopWordsRemover().setInputCol("words").setOutputCol(
    "filtered").setCaseSensitive(False)
# ngram = NGram().setN(2).setInputCol("filtered").setOutputCol("ngrams")
# For each sentence (bag of words), use HashingTF to hash it into a feature vector.
hashingTF = HashingTF().setNumFeatures(1000).setInputCol(
    "filtered").setOutputCol("rawFeatures")
# Create TF-IDF features
idf = IDF().setInputCol("rawFeatures").setOutputCol("features").setMinDocFreq(
    0)
# Create a linear SVM (LinearSVC) model; the variable keeps its original name `lr`
lr = LinearSVC(labelCol="label", featuresCol="features", maxIter=20)
# Chain all of the steps above into a single pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])
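# A minimal sketch of fitting and applying the pipeline above; `df` here is an
# assumed Spark DataFrame with a "comment_text" column and a binary "label"
# column (the snippet does not show how that frame is built):
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
pipeline_model = pipeline.fit(train_df)          # fits tokenizer -> TF-IDF -> SVC end to end
predictions = pipeline_model.transform(test_df)  # adds rawPrediction/prediction columns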

# clean = 1 / toxic = 0
test = [1 if i == 2 else 0 for i in test_data["class"]]

rebalanceDatasetTechnique = ["undersampling"]  #,"oversampling","no technique"]
for technique in rebalanceDatasetTechnique:
    print('**Processing {} on imbalanced data...**'.format(technique))
    df = spark.createDataFrame(train_binary_classification)
lr = LogisticRegression(featuresCol='scaledFeatures',
                        maxIter=100,
                        regParam=0.3,
                        elasticNetParam=0.8,
                        tol=0.0001,
                        family="binomial")
dt = DecisionTreeClassifier(featuresCol='scaledFeatures', seed=seed)
rf = RandomForestClassifier(featuresCol='scaledFeatures',
                            seed=seed,
                            numTrees=20)
GBDT = GBTClassifier(featuresCol='scaledFeatures', seed=seed)
layers = [feature_number, 10, 5, 2]
mlp = MultilayerPerceptronClassifier(featuresCol='scaledFeatures',
                                     layers=layers,
                                     seed=seed)
svm = LinearSVC(featuresCol='scaledFeatures', regParam=0.1)
nb = NaiveBayes(featuresCol='scaledFeatures', smoothing=1.0)

times = []


#model training and testing functions
def LR(trainingData, testData):

    start = time.time()
    Model = lr.fit(trainingData)
    end = time.time()
    times.append(end - start)

    results = Model.transform(testData)
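    # A minimal way to score these predictions, assuming a binary "label"
    # column and LinearSVC's default rawPrediction output (a sketch):
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                              labelCol="label")
    return evaluator.evaluate(results)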
Example #3
    # Accuracy on the test set
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    with open("/tmp/data/ml_data/nbModel_2_accuracy.txt", "w+") as f:
        f.write("Test Accuracy: " + str(accuracy))

    # Area under the ROC curve on the test set
    evaluator = BinaryClassificationEvaluator()
    with open("/tmp/data/ml_data/nbModel_2_test_set_area_under_ROC.txt", "w+") as f:
        f.write("Test set Area Under ROC: " + str(evaluator.evaluate(predictions)))

    ### Linear support vector machine 1
    # Train
    lsvc = LinearSVC(maxIter=10, regParam=0.1)
    lsvcModel_1 = lsvc.fit(train_set)

    predictions = lsvcModel_1.transform(test_set)
    # Accuracy on the test set
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    with open("/tmp/data/ml_data/lsvcModel_1_accuracy.txt", "w+") as f:
        f.write("Test Accuracy: " + str(accuracy))

    # Area under the ROC curve on the test set
    evaluator = BinaryClassificationEvaluator()
    with open("/tmp/data/ml_data/lsvcModel_1_test_set_area_under_ROC.txt", "w+") as f:
        f.write("Test set Area Under ROC: " + str(evaluator.evaluate(predictions)))
Example #4
pca_model = pca.fit(standardized_features_df70)
pca_train = pca_model.transform(standardized_features_df70)

logger.error("###### pca on standarded scaler using test")

pca = PCA(k=2, inputCol="std_features", outputCol="pca_features")
pca_model = pca.fit(standardized_features_df30)
pca_test = pca_model.transform(standardized_features_df30)

logger.error("############# svm")

from pyspark.ml.classification import LinearSVC

# Define your classifier
lsvc = LinearSVC(maxIter=30,
                 regParam=0.1,
                 featuresCol="pca_features",
                 labelCol="label")

stages209 = []
#stages += string_indexer
#stages += one_hot_encoder
#stages209 += [vector_assembler]
#stages209 += [minmax]
stages209 += [lsvc]

from pyspark.ml import Pipeline

pipeline209 = Pipeline().setStages(stages209)
svm7_model209 = pipeline209.fit(pca_train)

svm7_pp_df209 = svm7_model209.transform(pca_test)
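# A minimal sketch of scoring the transformed test frame above, assuming the
# default "rawPrediction" output and a binary "label" column:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator209 = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                             labelCol="label",
                                             metricName="areaUnderROC")
logger.error("svm AUC: {}".format(evaluator209.evaluate(svm7_pp_df209)))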
Example #5
plan_indexer = StringIndexer(inputCol = 'intl_plan', outputCol = 'intl_plan_indexed')
input_cols=['intl_plan_indexed'] + reduced_numeric_cols


#Feature Vector Assembler
assembler = VectorAssembler(inputCols = input_cols, outputCol = 'features')

#Standard Scaler
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",withStd=True, withMean=False)

#Configure Linear SVM Classifier Model
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC

#svmclassifier = LinearSVC(labelCol = 'label', featuresCol = 'scaledFeatures')
svmclassifier = LinearSVC(labelCol = 'label', featuresCol = 'features')

#Set SVM Pipeline Stages
#pipeline = Pipeline(stages=[plan_indexer, label_indexer, assembler, scaler, svmclassifier])
pipeline = Pipeline(stages=[plan_indexer, label_indexer, assembler, svmclassifier])


#Split Test and Train Sets
(train, test) = churn_data.randomSplit([0.75, 0.25])

#Spark Model Hyperparameter Tuning
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
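# A minimal sketch wiring these tuning imports together around the pipeline
# and the (train, test) split defined above (grid values are illustrative):
paramGrid = ParamGridBuilder() \
    .addGrid(svmclassifier.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(svmclassifier.maxIter, [10, 50]) \
    .build()
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator(labelCol='label'),
                    numFolds=3)
cv_model = cv.fit(train)
cv_predictions = cv_model.transform(test)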
Example #6
label_Idxstr = IndexToString(
    inputCol="prediction",
    outputCol="predicted_class",
    labels=["False", "True"],
)

# Text Vectorization

hashTF = HashingTF(inputCol="token_features", outputCol="tf_features")
idf = IDF(inputCol="tf_features", outputCol="features", minDocFreq=2)

# Classification Models

mnb_clf = NaiveBayes(smoothing=1.0)
svm_clf = LinearSVC(standardization=False)

#  Loading Everything to Pipeline

pipeline = Pipeline().setStages([
    document_assembler,
    sentence,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    finisher,
    hashTF,
    idf,
    label_strIdx,
    svm_clf,
])
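# A minimal usage sketch, assuming labeled train/test DataFrames with the raw
# text column the document assembler expects (names are illustrative):
pipeline_model = pipeline.fit(train_df)
predicted_df = label_Idxstr.transform(pipeline_model.transform(test_df))
predicted_df.select("predicted_class").show(5)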
Example #7
old_columns_names = df.columns
print(old_columns_names)
new_columns_names = [name + '-new' for name in old_columns_names]
for i in range(len(old_columns_names)):
    indexer = StringIndexer(inputCol=old_columns_names[i],
                            outputCol=new_columns_names[i])
    df = indexer.fit(df).transform(df)
vecAss = VectorAssembler(inputCols=new_columns_names[1:], outputCol='features')
df = vecAss.transform(df)
# Rename the label column
df = df.withColumnRenamed(new_columns_names[0], 'label')

# Create a new frame with only the label and features columns
data = df.select(['label', 'features'])

# Preview the data
data.show(5, truncate=0)

# Split the dataset into training and test sets
train_data, test_data = data.randomSplit([4.0, 1.0], 100)

from pyspark.ml.classification import LinearSVC
svm = LinearSVC()
svmModel = svm.fit(train_data)
result = svmModel.transform(test_data)

# accuracy
print(
    result.filter(result.label == result.prediction).count() / result.count())
# 0.9797172710510141
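# The same accuracy via the built-in evaluator (a sketch over the same
# `result` DataFrame):
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                                  predictionCol='prediction',
                                                  metricName='accuracy')
print(acc_evaluator.evaluate(result))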
models = cv.fit(ngramDataFrame)

result = models.transform(ngramDataFrame)

result1 = result.select("business_id","text","stars","label","features","ngrams")

idf = IDF(inputCol="features", outputCol="tdfeatures")
idfModel = idf.fit(result1)
rescaledData = idfModel.transform(result1)

testing = rescaledData.select("business_id","text","stars","label","tdfeatures","ngrams").withColumnRenamed("tdfeatures","features")
testing = testing.withColumn("label", testing["label"].cast(IntegerType()))


svm = LinearSVC()
model = svm.fit(testing)
coeffs = model.coefficients


vocabulary_ngram = models.vocabulary
weights_ngram = coeffs.toArray()
svm_coeffs_df_ngram = pd.DataFrame({'ngram': vocabulary_ngram, 'weight': weights_ngram})

sql = SQLContext(sc)

result = sql.createDataFrame(svm_coeffs_df_ngram)

result.coalesce(1).write.csv('bdad_dataset/output/twogramfeatures_'+top_bid)

rf = RandomForestClassifier(labelCol="CANCELLED", featuresCol="features")
rfModel = rf.fit(train)
predictions_rf = rfModel.transform(test)
accuracy_rf = evaluator.evaluate(predictions_rf)

# Naive Bayes
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing = 1.0, modelType = "multinomial", featuresCol = "features", labelCol = "CANCELLED")
nbModel = nb.fit(train)
predictions_nb = nbModel.transform(test)
accuracy_nb = evaluator.evaluate(predictions_nb)
# 0.59431219823991344

# SVM - tried but didn't work
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC(maxIter=10, regParam=0.1, featuresCol = "features", labelCol = "CANCELLED")
lsvcModel = lsvc.fit(train)
predictions_svm = lsvcModel.transform(test)
accuracy_svm = evaluator.evaluate(predictions_svm)

# Plotting accuracies for all models
import matplotlib.pyplot as plt
A = ['Logistic Regression','Decision Tree','Random Forest','Naive Bayes']
B = [accuracy_lr, accuracy_dt, accuracy_rf, accuracy_nb]

fig = plt.figure()
ax = fig.add_subplot(111)
plt.scatter(A, B)
axes = plt.gca()
axes.set_ylim([0, 1])  # evaluator accuracies are fractions in [0, 1], not [0, 100]
plt.show()
Example #10
File: zadanie.py  Project: kmbee/TSVD
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
    print("------------------------------------------------")
#anomaly detection
for center in centers:
    for point in center:
        if (point > 5 or -5 > point):
            print("anomaly: {0:.15f}".format(point))

#SVM
print("-------------------------------------------------")
print("-----------------------SVM-----------------------")
print("-------------------------------------------------")
svm_classifier = LinearSVC(featuresCol="features",
                           labelCol="Accident_Severity")
svm_model = svm_classifier.fit(training_data)
predictions = svm_model.transform(test_data)
test_error = predictions.filter(
    predictions["prediction"] != predictions["Accident_Severity"]).count(
    ) / float(test_data.count())
print "Testing error: {0:.4f}".format(test_error)
#kontingencna tabulka SVM
cf = predictions.crosstab("prediction", "Accident_Severity")
cf.show()
#vyhodnotenie SVM
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="Accident_Severity", predictionCol="prediction")
evaluator = BinaryClassificationEvaluator(labelCol="Accident_Severity",
                                          rawPredictionCol="prediction",
                                          metricName='areaUnderROC')
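# A minimal sketch applying the two evaluators defined above to the SVM
# predictions:
print("SVM F1: {0:.4f}".format(
    evaluatorMulti.evaluate(predictions, {evaluatorMulti.metricName: "f1"})))
print("SVM AUC: {0:.4f}".format(evaluator.evaluate(predictions)))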
Example #11
nb_0 = NaiveBayes(featuresCol='features',
                  labelCol='label_0',
                  predictionCol='nb_pred_0',
                  probabilityCol='nb_prob_0',
                  rawPredictionCol='nb_raw_0')
nb_1 = NaiveBayes(featuresCol='features',
                  labelCol='label_1',
                  predictionCol='nb_pred_1',
                  probabilityCol='nb_prob_1',
                  rawPredictionCol='nb_raw_1')
nb_2 = NaiveBayes(featuresCol='features',
                  labelCol='label_2',
                  predictionCol='nb_pred_2',
                  probabilityCol='nb_prob_2',
                  rawPredictionCol='nb_raw_2')
svm_0 = LinearSVC(featuresCol='features',
                  labelCol='label_0',
                  predictionCol='svm_pred_0',
                  rawPredictionCol='svm_raw_0')
svm_1 = LinearSVC(featuresCol='features',
                  labelCol='label_1',
                  predictionCol='svm_pred_1',
                  rawPredictionCol='svm_raw_1')
svm_2 = LinearSVC(featuresCol='features',
                  labelCol='label_2',
                  predictionCol='svm_pred_2',
                  rawPredictionCol='svm_raw_2')

# build pipeline to generate predictions from base classifiers, will be used in task 1.3
gen_base_pred_pipeline = Pipeline(
    stages=[nb_0, nb_1, nb_2, svm_0, svm_1, svm_2])
gen_base_pred_pipeline_model = gen_base_pred_pipeline.fit(training_set)
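# Applying the fitted pipeline yields one frame holding all six base
# predictions side by side (a sketch):
base_pred_df = gen_base_pred_pipeline_model.transform(training_set)
base_pred_df.select('nb_pred_0', 'nb_pred_1', 'nb_pred_2',
                    'svm_pred_0', 'svm_pred_1', 'svm_pred_2').show(5)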
Example #12
# COMMAND ----------

nb_accuracy = evaluator.evaluate(nb_prediction)
print("Accuracy of NaiveBayes is  = %g"% (nb_accuracy))
print("Test Error of NaiveBayes  = %g " % (1.0 - nb_accuracy))

# COMMAND ----------

# MAGIC %md 
# MAGIC ###### Support Vector Machine

# COMMAND ----------

from pyspark.ml.classification import LinearSVC
svm = LinearSVC(labelCol="Survived", featuresCol="features")
svm_model = svm.fit(trainingData)
svm_prediction = svm_model.transform(testData)
svm_prediction.select("prediction", "Survived", "features").show()


# COMMAND ----------

# MAGIC %md 
# MAGIC ###### Evaluating the accuracy of Support Vector Machine.

# COMMAND ----------

svm_accuracy = evaluator.evaluate(svm_prediction)
print("Accuracy of Support Vector Machine is = %g"% (svm_accuracy))
print("Test Error of Support Vector Machine = %g " % (1.0 - svm_accuracy))
Example #13
eval_metrics = lr_model.avgMetrics

param_res = []

for params, metric in zip(param_maps, eval_metrics):
    param_metric = {}
    for key, param_val in zip(params.keys(), params.values()):
        param_metric[key.name] = param_val
    param_res.append((param_metric, metric))

param_res = sorted(param_res, key=lambda x: x[1], reverse=True)

# In[85]:

# Create model 2
lsvc = LinearSVC(maxIter=5)
#paramGrid2 = ParamGridBuilder().addGrid(lsvc.regParam, [0.3, 0.01]).addGrid(lsvc.maxIter, [10, 5]).build()
paramGrid2 = ParamGridBuilder().addGrid(lsvc.regParam, [0.3, 0.01]).build()
evaluator2 = MulticlassClassificationEvaluator(metricName="f1")
crossval1 = CrossValidator(estimator=lsvc,
                           estimatorParamMaps=paramGrid2,
                           evaluator=evaluator2,
                           numFolds=3)
lsvc_model = crossval1.fit(train)
print('LinearSVC validation F1: {}'.format(
    evaluator2.evaluate(lsvc_model.transform(validation))))

# In[86]:

lsvc_model.getEstimatorParamMaps()
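# The same params/metric pairing used above for the logistic model also works
# for the SVC search (a sketch):
for params, metric in zip(lsvc_model.getEstimatorParamMaps(),
                          lsvc_model.avgMetrics):
    print({p.name: v for p, v in params.items()}, metric)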
Example #14
         "label",
         when(col("a.id") == col("b.id"),
              lit("1")).otherwise(lit("0"))).selectExpr(
                  "label", "text_a", "text_b")
 matched_df = joined_df.where(col("label") == 1)
 not_matched_df = joined_df.where(col("label") == 0).limit(1000)
 labeled_df = matched_df.unionAll(not_matched_df)
 labeled_df.show(10, False)
 pipeline_model = pipeline.fit(labeled_df)
 transform_df = pipeline_model.transform(labeled_df).selectExpr(
     "cast(label as double) label", "features")
 # view the transformed data
 (train_df, test_df) = transform_df.randomSplit([0.7, 0.3], 24)
 logging.info("Count of training data: {}".format(train_df.count()))
 logging.info("Count of testing data: {}".format(test_df.count()))
 svm = LinearSVC(maxIter=5, regParam=0.01)
 model = svm.fit(train_df)
 logging.info("Model Coefficient {}".format(model.coefficients))
 logging.info("Model Intercept {}".format(model.intercept))
 logging.info("Model number of classes {}".format(model.numClasses))
 logging.info("Model number of features {}".format(model.numFeatures))
 predictions = model.transform(test_df)
 evaluator_svm = BinaryClassificationEvaluator(
     rawPredictionCol="prediction")
 area_under_curve = evaluator_svm.evaluate(predictions)
 logging.info("Area Under Curve is {}".format(area_under_curve))
 new_df = spark.createDataFrame([
     ("ALIABBAS BHOJANI", "LIABBAS BHOJANI"),
     ("ALIABBAS BHOJANI", "MUSTAFA CHALLAWALA")
 ]).toDF("text_a", "text_b").select(
     split(col("text_a"), " ").alias("text_a"),
Example #15
def SparkML(train_df,
            test_df=None,
            featuresCol='features',
            labelCol='label',
            binaryclass=False,
            multiclass=False,
            n_cluster=2,
            userCol='user',
            itemCol='item',
            ratingCol='rating',
            rank=10,
            userid=3,
            itemid=3,
            itemsCol='items',
            minSupport=0.3,
            minConfidence=0.8,
            stringIndexer=False,
            inputColStringIndexer=None,
            outputColStringIndexer=None,
            oneHotEncoder=False,
            inputColOneHotEncoder=None,
            outputColOneHotEncoder=None,
            vectorAssembler=False,
            inputColsVectorAssembler=None,
            outputColsVectorAssembler=None,
            vectorIndexer=False,
            inputColsVectorIndexer=None,
            outputColsVectorIndexer=None,
            maxCategories=None,
            classification=False,
            logisticregression=False,
            decisiontreeclassifier=False,
            linearsvc=False,
            naivebayes=False,
            randomforestclassifier=False,
            gbtclassifier=False,
            regression=False,
            linearregression=True,
            decisiontreeregressor=False,
            randomforestregressor=False,
            gbtregressor=False,
            clustering=False,
            kmeans=False,
            gaussianmixture=False,
            lda=False,
            recommendation=False,
            als=False,
            association=False,
            fpgrowth=False):
    if classification:
        if logisticregression:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LRClassifier = LogisticRegression(featuresCol=featuresCol,
                                              labelCol=labelCol,
                                              predictionCol='Prediction',
                                              probabilityCol='Probability',
                                              rawPredictionCol='RawPrediction',
                                              standardization=True,
                                              maxIter=100,
                                              regParam=0.0,
                                              elasticNetParam=0.0,
                                              tol=1e-06,
                                              fitIntercept=True,
                                              threshold=0.5)
            paramGrid = ParamGridBuilder().addGrid(
                LRClassifier.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    LRClassifier.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            LRCV = CrossValidator(estimator=LRClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(LRCV)
            LRC_Pipeline = Pipeline(stages=stagesList)
            LRC_PipelineModel = LRC_Pipeline.fit(train_df)
            LRC_Predicted = LRC_PipelineModel.transform(test_df)
            LRC_BestModel = LRC_PipelineModel.stages[-1].bestModel
            LRC_Probability = LRC_Predicted.select("Probability").toPandas()
            LRC_Prediction = LRC_Predicted.select("Prediction").toPandas()
            LRC_Score = evaluator.evaluate(LRC_Predicted)
            return LRC_BestModel, LRC_Predicted, LRC_Probability, LRC_Prediction, LRC_Score
        if decisiontreeclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            DTClassifier = DecisionTreeClassifier(
                featuresCol=featuresCol,
                labelCol=labelCol,
                predictionCol='Prediction',
                probabilityCol='Probability',
                rawPredictionCol='RawPrediction',
                maxDepth=5,
                maxBins=32,
                minInstancesPerNode=1,
                minInfoGain=0.0,
                impurity='gini',
                seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                DTClassifier.impurity, ["gini", "entropy"]).addGrid(
                    DTClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                        DTClassifier.maxBins,
                        [3, 5, 10, 50, 100, 200]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            DTCV = CrossValidator(estimator=DTClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(DTCV)
            DTC_Pipeline = Pipeline(stages=stagesList)
            DTC_PipelineModel = DTC_Pipeline.fit(train_df)
            DTC_Predicted = DTC_PipelineModel.transform(test_df)
            DTC_BestModel = DTC_PipelineModel.stages[-1].bestModel
            DTC_Probability = DTC_Predicted.select("Probability").toPandas()
            DTC_Prediction = DTC_Predicted.select("Prediction").toPandas()
            DTC_Score = evaluator.evaluate(DTC_Predicted)
            return DTC_BestModel, DTC_Predicted, DTC_Probability, DTC_Prediction, DTC_Score
        if linearsvc:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            SVClassifier = LinearSVC(featuresCol=featuresCol,
                                     labelCol=labelCol,
                                     predictionCol='Prediction',
                                     rawPredictionCol='RawPrediction',
                                     maxIter=100,
                                     regParam=0.0,
                                     tol=1e-06,
                                     fitIntercept=True,
                                     standardization=True,
                                     threshold=0.0)
            paramGrid = ParamGridBuilder().addGrid(
                SVClassifier.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    SVClassifier.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            SVCV = CrossValidator(estimator=SVClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(SVCV)
            SVC_Pipeline = Pipeline(stages=stagesList)
            SVC_PipelineModel = SVC_Pipeline.fit(train_df)
            SVC_Predicted = SVC_PipelineModel.transform(test_df)
            SVC_BestModel = SVC_PipelineModel.stages[-1].bestModel
            SVC_Prediction = SVC_Predicted.select("Prediction").toPandas()
            SVC_Score = evaluator.evaluate(SVC_Predicted)
            return SVC_BestModel, SVC_Predicted, SVC_Prediction, SVC_Score
        if naivebayes:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            NBClassifier = NaiveBayes(featuresCol=featuresCol,
                                      labelCol=labelCol,
                                      predictionCol='Prediction',
                                      probabilityCol='Probability',
                                      rawPredictionCol='RawPrediction',
                                      smoothing=1.0,
                                      modelType='multinomial',
                                      thresholds=None)
            paramGrid = ParamGridBuilder().addGrid(
                NBClassifier.smoothing,
                [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            NBCV = CrossValidator(estimator=NBClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(NBCV)
            NBC_Pipeline = Pipeline(stages=stagesList)
            NBC_PipelineModel = NBC_Pipeline.fit(train_df)
            NBC_Predicted = NBC_PipelineModel.transform(test_df)
            NBC_BestModel = NBC_PipelineModel.stages[-1].bestModel
            NBC_Probability = NBC_Predicted.select("Probability").toPandas()
            NBC_Prediction = NBC_Predicted.select("Prediction").toPandas()
            NBC_Score = evaluator.evaluate(NBC_Predicted)
            return NBC_BestModel, NBC_Predicted, NBC_Probability, NBC_Prediction, NBC_Score
        if randomforestclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            RFClassifier = RandomForestClassifier(
                featuresCol=featuresCol,
                labelCol=labelCol,
                predictionCol='Prediction',
                probabilityCol='Probability',
                rawPredictionCol='RawPrediction',
                maxDepth=5,
                maxBins=32,
                minInstancesPerNode=1,
                minInfoGain=0.0,
                impurity='gini',
                numTrees=20,
                featureSubsetStrategy='auto',
                seed=None,
                subsamplingRate=1.0)
            paramGrid = ParamGridBuilder().addGrid(
                RFClassifier.impurity, ["gini", "entropy"]).addGrid(
                    RFClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                        RFClassifier.maxBins,
                        [3, 5, 10, 50, 100, 200]).addGrid(
                            RFClassifier.numTrees,
                            [5, 10, 20, 50, 100, 200]).addGrid(
                                RFClassifier.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            RFCV = CrossValidator(estimator=RFClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(RFCV)
            RFC_Pipeline = Pipeline(stages=stagesList)
            RFC_PipelineModel = RFC_Pipeline.fit(train_df)
            RFC_Predicted = RFC_PipelineModel.transform(test_df)
            RFC_BestModel = RFC_PipelineModel.stages[-1].bestModel
            RFC_Probability = RFC_Predicted.select("Probability").toPandas()
            RFC_Prediction = RFC_Predicted.select("Prediction").toPandas()
            RFC_Score = evaluator.evaluate(RFC_Predicted)
            return RFC_BestModel, RFC_Predicted, RFC_Probability, RFC_Prediction, RFC_Score
        if gbtclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GBClassifier = GBTClassifier(featuresCol=featuresCol,
                                         labelCol=labelCol,
                                         predictionCol='Prediction',
                                         maxDepth=5,
                                         maxBins=32,
                                         minInstancesPerNode=1,
                                         minInfoGain=0.0,
                                         lossType='logistic',
                                         maxIter=20,
                                         stepSize=0.1,
                                         seed=None,
                                         subsamplingRate=1.0)
            paramGrid = ParamGridBuilder().addGrid(
                GBClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    GBClassifier.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        GBClassifier.maxIter,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            GBClassifier.stepSize,
                            [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).addGrid(
                                GBClassifier.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = MulticlassClassificationEvaluator(
                labelCol=labelCol,
                predictionCol="Prediction",
                metricName="accuracy")
            GBCV = CrossValidator(estimator=GBClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(GBCV)
            GBC_Pipeline = Pipeline(stages=stagesList)
            GBC_PipelineModel = GBC_Pipeline.fit(train_df)
            GBC_Predicted = GBC_PipelineModel.transform(test_df)
            GBC_BestModel = GBC_PipelineModel.stages[-1].bestModel
            GBC_Prediction = GBC_Predicted.select("Prediction").toPandas()
            GBC_Score = evaluator.evaluate(GBC_Predicted)
            return GBC_BestModel, GBC_Predicted, GBC_Prediction, GBC_Score
    if regression:
        if linearregression:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LRegressor = LinearRegression(featuresCol=featuresCol,
                                          labelCol=labelCol,
                                          predictionCol='Prediction',
                                          standardization=True,
                                          fitIntercept=True,
                                          loss='squaredError',
                                          maxIter=100,
                                          regParam=0.0,
                                          elasticNetParam=0.0,
                                          tol=1e-06,
                                          epsilon=1.35)
            paramGrid = ParamGridBuilder().addGrid(
                LRegressor.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    LRegressor.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            LRCV = CrossValidator(estimator=LRegressor,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(LRCV)
            LR_Pipeline = Pipeline(stages=stagesList)
            LR_PipelineModel = LR_Pipeline.fit(train_df)
            LR_Predicted = LR_PipelineModel.transform(test_df)
            LR_BestModel = LR_PipelineModel.stages[-1].bestModel
            LR_Prediction = LR_Predicted.select("Prediction").toPandas()
            LR_Score = evaluator.evaluate(LR_Predicted)
            return LR_BestModel, LR_Predicted, LR_Prediction, LR_Score
        if decisiontreeregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            DTRegressor = DecisionTreeRegressor(featuresCol=featuresCol,
                                                labelCol=labelCol,
                                                predictionCol='Prediction',
                                                maxDepth=5,
                                                maxBins=32,
                                                minInstancesPerNode=1,
                                                minInfoGain=0.0,
                                                impurity='variance',
                                                seed=None,
                                                varianceCol=None)
            paramGrid = ParamGridBuilder().addGrid(
                DTRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    DTRegressor.maxBins, [3, 5, 10, 50, 100, 200]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            DTRCV = CrossValidator(estimator=DTRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(DTRCV)
            DTR_Pipeline = Pipeline(stages=stagesList)
            DTR_PipelineModel = DTR_Pipeline.fit(train_df)
            DTR_Predicted = DTR_PipelineModel.transform(test_df)
            DTR_BestModel = DTR_PipelineModel.stages[-1].bestModel
            DTR_Prediction = DTR_Predicted.select("Prediction").toPandas()
            DTR_Score = evaluator.evaluate(DTR_Predicted)
            return DTR_BestModel, DTR_Predicted, DTR_Prediction, DTR_Score
        if randomforestregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            RFRegressor = RandomForestRegressor(featuresCol=featuresCol,
                                                labelCol=labelCol,
                                                predictionCol='Prediction',
                                                maxDepth=5,
                                                maxBins=32,
                                                minInstancesPerNode=1,
                                                minInfoGain=0.0,
                                                impurity='variance',
                                                subsamplingRate=1.0,
                                                seed=None,
                                                numTrees=20)
            paramGrid = ParamGridBuilder().addGrid(
                RFRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    RFRegressor.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        RFRegressor.numTrees,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            RFRegressor.subsamplingRate,
                            [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            RFRCV = CrossValidator(estimator=RFRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(RFRCV)
            RFR_Pipeline = Pipeline(stages=stagesList)
            RFR_PipelineModel = RFR_Pipeline.fit(train_df)
            RFR_Predicted = RFR_PipelineModel.transform(test_df)
            RFR_BestModel = RFR_PipelineModel.stages[-1].bestModel
            RFR_Prediction = RFR_Predicted.select("Prediction").toPandas()
            RFR_Score = evaluator.evaluate(RFR_Predicted)
            return RFR_BestModel, RFR_Predicted, RFR_Prediction, RFR_Score
        if gbtregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GBRegressor = GBTRegressor(featuresCol=featuresCol,
                                       labelCol=labelCol,
                                       predictionCol='Prediction',
                                       maxDepth=5,
                                       maxBins=32,
                                       minInstancesPerNode=1,
                                       minInfoGain=0.0,
                                       subsamplingRate=1.0,
                                       lossType='squared',
                                       maxIter=20,
                                       stepSize=0.1,
                                       seed=None,
                                       impurity='variance')
            paramGrid = ParamGridBuilder().addGrid(
                GBRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    GBRegressor.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        GBRegressor.maxIter,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            GBRegressor.stepSize,
                            [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).addGrid(
                                GBRegressor.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            GBRCV = CrossValidator(estimator=GBRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(GBRCV)
            GBR_Pipeline = Pipeline(stages=stagesList)
            GBR_PipelineModel = GBR_Pipeline.fit(train_df)
            GBR_Predicted = GBR_PipelineModel.transform(test_df)
            GBR_BestModel = GBR_PipelineModel.stages[-1].bestModel
            GBR_Prediction = GBR_Predicted.select("Prediction").toPandas()
            GBR_Score = evaluator.evaluate(GBR_Predicted)
            return GBR_BestModel, GBR_Predicted, GBR_Prediction, GBR_Score
    if clustering:
        if kmeans:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            KCluster = KMeans(featuresCol=featuresCol,
                              predictionCol='Prediction',
                              k=n_cluster,
                              initMode='k-means||',
                              initSteps=2,
                              tol=0.0001,
                              maxIter=20,
                              seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                KCluster.initSteps, [1, 2, 5, 10, 20, 50, 100]).addGrid(
                    KCluster.maxIter,
                    [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                        KCluster.seed, [i for i in range(1001)]).build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            KMCV = CrossValidator(estimator=KCluster,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(KMCV)
            KMC_Pipeline = Pipeline(stages=stagesList)
            KMC_PipelineModel = KMC_Pipeline.fit(train_df)
            KMC_Predicted = KMC_PipelineModel.transform(train_df)
            KMC_BestModel = KMC_PipelineModel.stages[-1].bestModel
            KMC_Prediction = KMC_Predicted.select("Prediction").toPandas()
            KMC_Score = evaluator.evaluate(KMC_Predicted)
            return KMC_BestModel, KMC_Predicted, KMC_Prediction, KMC_Score
        if gaussianmixture:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GMCluster = GaussianMixture(featuresCol=featuresCol,
                                        predictionCol='Prediction',
                                        probabilityCol='Probability',
                                        k=n_cluster,
                                        tol=0.01,
                                        maxIter=100,
                                        seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                GMCluster.maxIter,
                [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                    GMCluster.seed, [i for i in range(1001)]).build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            GMCV = CrossValidator(estimator=GMCluster,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(GMCV)
            GMC_Pipeline = Pipeline(stages=stagesList)
            GMC_PipelineModel = GMC_Pipeline.fit(train_df)
            GMC_Predicted = GMC_PipelineModel.transform(train_df)
            GMC_BestModel = GMC_PipelineModel.stages[-1].bestModel
            GMC_Probability = GMC_Predicted.select("Probability").toPandas()
            GMC_Prediction = GMC_Predicted.select("Prediction").toPandas()
            GMC_Score = evaluator.evaluate(GMC_Predicted)
            return GMC_BestModel, GMC_Predicted, GMC_Probability, GMC_Prediction, GMC_Score
        if lda:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LDACluster = LDA(featuresCol=featuresCol,
                             maxIter=20,
                             seed=None,
                             k=n_cluster,
                             learningOffset=1024.0,
                             learningDecay=0.51,
                             subsamplingRate=0.05)
            paramGrid = ParamGridBuilder().addGrid(
                LDACluster.maxIter,
                [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                    LDACluster.seed, [i for i in range(1001)]).addGrid(
                        LDACluster.subsamplingRate,
                        [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            LDACV = CrossValidator(estimator=LDACluster,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(LDACV)
            LDA_Pipeline = Pipeline(stages=stagesList)
            LDA_PipelineModel = LDA_Pipeline.fit(train_df)
            LDA_Predicted = LDA_PipelineModel.transform(train_df)
            LDA_BestModel = LDA_PipelineModel.stages[-1].bestModel
            LDA_Topics = LDA_BestModel.describeTopics().toPandas()
            LDA_Score = evaluator.evaluate(LDA_Predicted)
            return LDA_BestModel, LDA_Topics, LDA_Score
    if recommendation:
        if als:
            ALSR = ALS(userCol=userCol,
                       itemCol=itemCol,
                       ratingCol=ratingCol,
                       rank=rank,
                       maxIter=10,
                       regParam=0.1,
                       numUserBlocks=10,
                       numItemBlocks=10,
                       alpha=1.0,
                       seed=1)
            ALSR_Model = ALSR.fit(train_df)
            # recommendForAllUsers/recommendForAllItems take the number of top
            # recommendations to return per user/item, not an id
            ALSR_ForUsers = ALSR_Model.recommendForAllUsers(userid)
            ALSR_ForItems = ALSR_Model.recommendForAllItems(itemid)
            return ALSR_Model, ALSR_ForUsers, ALSR_ForItems
    if association:
        if fpgrowth:
            fpg = FPGrowth(minSupport=minSupport,
                           minConfidence=minConfidence,
                           itemsCol=itemsCol,
                           predictionCol='Prediction')
            fpg_model = fpg.fit(train_df)
            fpg_freqItemsets = fpg_model.freqItemsets.toPandas()
            fpg_associationRules = fpg_model.associationRules.toPandas()
            return fpg_model, fpg_freqItemsets, fpg_associationRules
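# A minimal usage sketch of the helper above, training a cross-validated
# linear SVC on pre-assembled data; `train_df` and `test_df` are assumed to
# exist with 'features' and 'label' columns:
SVC_BestModel, SVC_Predicted, SVC_Prediction, SVC_Score = SparkML(
    train_df,
    test_df=test_df,
    binaryclass=True,
    classification=True,
    linearsvc=True)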
evaluator = BinaryClassificationEvaluator(labelCol="model_photography")

print("\nModelo de Árbol de Decisión")
print("Test Area Under ROC: " + str(
    evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))
print("Precision: " + str(metrics.precision(1.0)))
print("Recall: " + str(metrics.recall(1.0)))
print("F1-Score: " + str(metrics.fMeasure(1.0)))

# # Support Vector Machine Model

# In[155]:

# Build and train the model
lsvc = LinearSVC(featuresCol='features',
                 labelCol='model_photography',
                 maxIter=10,
                 regParam=0.1)

# Fit the model
lsvcModel = lsvc.fit(train_df)

# Now make some predictions and evaluate performance
lsv_predictions = lsvcModel.transform(test_df)
test = test_df.rdd
# Instantiate metrics object

#important: need to cast to float type, and order by prediction, else it won't work
preds_and_labels = lsv_predictions.select(
    ['prediction', 'model_photography']).withColumn(
        'model_photography',
        F.col('model_photography').cast(FloatType())).orderBy('prediction')
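# A minimal sketch of the metrics object the comment above refers to, using
# the RDD-based MulticlassMetrics API:
from pyspark.mllib.evaluation import MulticlassMetrics

metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
print("Precision: " + str(metrics.precision(1.0)))
print("Recall: " + str(metrics.recall(1.0)))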
from pyspark.sql import SparkSession

if __name__ == "__main__":

    spark_session = SparkSession\
        .builder\
        .appName("Spark SVM")\
        .getOrCreate()

    # Loads data
    dataset = spark_session\
        .read\
        .format("libsvm")\
        .load("data/classificationDataLibsvm.txt")

    dataset.printSchema()
    dataset.show()

    linear_SVM = LinearSVC(maxIter=10, regParam=0.1)

    svm_model = linear_SVM.fit(dataset)

    print("Coefficients: " + str(svm_model.coefficients))
    print("Intercept: " + str(svm_model.intercept))

    svm_model.save("SVMModel")
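    # The saved model can be restored later with the companion model class
    # (a sketch; "SVMModel" is the path written above):
    from pyspark.ml.classification import LinearSVCModel
    reloaded_model = LinearSVCModel.load("SVMModel")
    print("Reloaded intercept: " + str(reloaded_model.intercept))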

    spark_session.stop()


Example #18
def main():
    #Dataframe header
    headings = [
        'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu',
        'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime',
        'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities',
        'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime',
        'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3'
    ]

    # Create the Spark session
    spark = SparkSession.builder.appName("Student").getOrCreate()

    #-------------------- PREPROCESSING AND MODEL TRAINING --------------------#

    # Create the dataframe used previously
    df = spark.read.csv('Datos streaming/feed/student-por1.csv',
                        sep=';',
                        header=True)
    esquema = df.schema

    # Map categorical values to numeric codes
    df = categoricalToNumerical(df)

    # Cast string columns to int
    df = stringToInt(df)

    # Binarize the target grade (approved or failed)
    df = approvedOrReproved(df)

    # Drop outliers
    df = dropAtypicValues(df)

    vector = VectorAssembler(inputCols=[
        'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu',
        'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime',
        'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities',
        'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime',
        'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2'
    ],
                             outputCol="features")

    # Apply the assembler to the dataset
    df_temp = vector.transform(df)

    df = df_temp.drop('school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
                      'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian',
                      'traveltime', 'studytime', 'failures', 'schoolsup',
                      'famsup', 'paid', 'activities', 'nursery', 'higher',
                      'internet', 'romantic', 'famrel', 'freetime', 'goout',
                      'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2')

    svm = LinearSVC(labelCol="G3",
                    featuresCol="features",
                    maxIter=10,
                    threshold=0.5,
                    aggregationDepth=2,
                    regParam=0.0)
    model = svm.fit(df)
    model.write().overwrite().save("Modelo1")

    #---------------------------------STREAMING--------------------------------#

    # Create the streaming dataframe
    df = spark.readStream.csv('Datos streaming/read',
                              sep=';',
                              header=True,
                              schema=esquema)

    # Map categorical values to numeric codes
    df = categoricalToNumerical(df)

    # Cast string columns to int
    df = stringToInt(df)

    # Binarize the target grade (approved or failed)
    df = approvedOrReproved(df)

    # Drop outliers
    df = dropAtypicValues(df)

    vector = VectorAssembler(inputCols=[
        'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu',
        'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime',
        'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities',
        'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime',
        'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2'
    ],
                             outputCol="features")

    # Apply the assembler to the dataset
    df_temp = vector.transform(df)

    df = df_temp.drop('school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
                      'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian',
                      'traveltime', 'studytime', 'failures', 'schoolsup',
                      'famsup', 'paid', 'activities', 'nursery', 'higher',
                      'internet', 'romantic', 'famrel', 'freetime', 'goout',
                      'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2')

    # Write the stream to the console for debugging purposes
    test = df.writeStream.format("console").outputMode("update").foreach(
        predict).trigger(processingTime='65 seconds').start()
    test.awaitTermination()
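# `predict`, used by the foreach sink above, is not defined in this snippet; a
# hypothetical per-row version (assuming Spark 3.x, where LinearSVCModel
# exposes single-vector predict, and the "Modelo1" path saved above) might be:
from pyspark.ml.classification import LinearSVCModel

loaded_model = LinearSVCModel.load("Modelo1")

def predict(row):
    # row carries the assembled 'features' vector and the binarized G3 label
    label = loaded_model.predict(row["features"])
    print("G3:", row["G3"], "-> predicted:", label)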
Example #19
    assembler = VectorAssembler(inputCols=[
        "hair", "feathers", "eggs", "milk", "airborne", "aquatic", "predator",
        "toothed", "backbone", "breathes", "venomous", "fins", "legs", "tail",
        "domestic", "catsize"
    ],
                                outputCol="features")

    # Step - 2: Transform dataframe to vectorized dataframe
    output = assembler.transform(animals).select("features", "eatable",
                                                 "cyr_name")

    output.cache()

    # Step - 3: Set up the LinearSVC Classifier
    trainer = LinearSVC(labelCol="eatable", featuresCol="features")

    # Step - 4: Train the model
    model = trainer.fit(output)

    print("Coefficients: " + str(model.coefficients) + " Intercept: " +
          str(model.intercept))

    rawPredictions = model.transform(output)

    predictions = enrichPredictions(rawPredictions)

    predictions.show(100)

    # Step - 5: Evaluate prediction
    # completed from context: LinearSVC emits 'rawPrediction' by default
    evaluator = BinaryClassificationEvaluator(labelCol="eatable",
                                              rawPredictionCol="rawPrediction")
Example #20
# precision:       0.22341727876880027
# recall:          0.6314878892733564
# accuracy:        0.8737938503096747
# auroc:           0.858413849659377
# elapsed: 19 seconds
 
#============Linear SVM Classifier
# LinearSVC(featuresCol='features', labelCol='label', predictionCol='prediction',
# maxIter=100, regParam=0.0, tol=1e-06, rawPredictionCol='rawPrediction', fitIntercept=True, standardization=True,
# threshold=0.0, weightCol=None, aggregationDepth=2) 
#This binary classifier optimizes the Hinge Loss using the OWLQN optimizer. Only supports L2 regularization currently.
 
  
#standardization=True (the default)
from datetime import datetime  # needed for the timing below

a = datetime.now()
svm = LinearSVC(featuresCol='raw_Features', labelCol='Class')
svm_model = svm.fit(training_downsampled)     
predictions = svm_model.transform(test_downsampled)  #test
predictions.cache()
print_binary_metrics(predictions)
b=datetime.now() 
print((b-a).seconds)     
 
 

# actual total:    82183
# actual positive: 4046
# actual negative: 78137
# nP:              12611
# nN:              69572
# TP:              2653
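# `print_binary_metrics` is not shown in this snippet; a hypothetical version,
# consistent with the counts shown above, might look like this:
import pyspark.sql.functions as F

def print_binary_metrics(predictions, label_col="Class"):
    total = predictions.count()
    positive = predictions.filter(F.col(label_col) == 1).count()
    nP = predictions.filter(F.col("prediction") == 1).count()
    TP = predictions.filter((F.col("prediction") == 1) &
                            (F.col(label_col) == 1)).count()
    print("actual total:    ", total)
    print("actual positive: ", positive)
    print("nP:              ", nP)
    print("TP:              ", TP)
    print("precision:       ", TP / nP if nP else 0.0)
    print("recall:          ", TP / positive if positive else 0.0)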
Example #21
print("Correct : ",  correct)
print("Wrong: ",  wrong)
print("Ratio wrong: " , ratioWrong)
print("Ratio correct: ",  ratioCorrect)
print("Ratio true positive : ",  truep)
print("Ratio false positive : ",  falsep)

print("Ratio true negative : ",  truen)
print("Ratio false negative : ",  falsen)

# COMMAND ----------

#CV model of LSVC
from pyspark.ml.classification import LinearSVC,  LinearSVCModel
svm = ( LinearSVC()
        .setFeaturesCol("features")
        .setLabelCol("label")
     )
from pyspark.ml import Pipeline

pipeline =  Pipeline().setStages([
  ipindexer, # categorize internation_plan
  labelindexer, # categorize churn
  assembler, # assemble the feature vector for all columns
  svm])
pipelineModel = pipeline.fit(trainDF)

numFolds = 3
MaxIter = [1000]
RegParam = [0.1, 0.01]  # L2 regularization strength (LinearSVC supports only L2)
Tol = [1e-8]  # convergence tolerance for the iterative optimizer
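# The snippet stops before wiring these lists into a tuner; a plausible
# continuation under the usual CrossValidator pattern (an assumption, not the
# project's original code):
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

paramGrid = (ParamGridBuilder()
             .addGrid(svm.maxIter, MaxIter)
             .addGrid(svm.regParam, RegParam)
             .addGrid(svm.tol, Tol)
             .build())
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=numFolds)
cvModel = cv.fit(trainDF)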
Example #22
File: vista.py  Project: Advitya17/Vista
def downstream_ml_func(features_df,
                       results_dict,
                       layer_index,
                       model_name='LogisticRegression',
                       extra_config={},
                       tuning_method=None,
                       seed=2019,
                       test_size=0.2):
    def hyperparameter_tuned_model(clf, train_df):
        pipeline = Pipeline(stages=[clf])

        paramGrid = ParamGridBuilder()
        for i in extra_config:
            if i == 'numFolds':
                continue
            # look up the Param object by name (safer than eval)
            paramGrid = paramGrid.addGrid(getattr(clf, i), extra_config[i])

        paramGrid = paramGrid.build()
        evaluator = MulticlassClassificationEvaluator()

        if tuning_method == 'CrossValidator':

            if 'numFolds' in extra_config:
                numFolds = extra_config['numFolds']
            else:
                numFolds = 3  # default

            val_model = CrossValidator(estimator=pipeline,
                                       estimatorParamMaps=paramGrid,
                                       evaluator=evaluator,
                                       numFolds=numFolds,
                                       seed=seed)

        if tuning_method == 'TrainValidationSplit':

            val_model = TrainValidationSplit(
                estimator=pipeline,
                estimatorParamMaps=paramGrid,
                evaluator=evaluator,
                seed=seed,
                # 80% of the data will be used for training, 20% for validation.
                trainRatio=1 - test_size)

        # Fit the chosen tuner and return the model with the best parameter set.
        return val_model.fit(train_df)

    train_df, test_df = features_df.randomSplit([1 - test_size, test_size],
                                                seed=seed)

    if model_name == 'LogisticRegression':
        clf = LogisticRegression(labelCol="label",
                                 featuresCol="features",
                                 maxIter=10,
                                 regParam=0.1)

    if model_name == 'LinearSVC':
        clf = LinearSVC(maxIter=5, regParam=0.01)

    if model_name == 'DecisionTreeClassifier':
        stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
        si_model = stringIndexer.fit(train_df)
        train_df = si_model.transform(train_df)

        clf = DecisionTreeClassifier(maxDepth=2, labelCol="indexed", seed=seed)

    if model_name == 'GBTClassifier':
        stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
        si_model = stringIndexer.fit(train_df)
        train_df = si_model.transform(train_df)

        clf = GBTClassifier(labelCol="label",
                            featuresCol="features",
                            maxIter=50,
                            maxDepth=5,
                            seed=seed)

    if model_name == 'RandomForestClassifier':
        stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
        si_model = stringIndexer.fit(train_df)
        train_df = si_model.transform(train_df)

        clf = RandomForestClassifier(labelCol="label",
                                     featuresCol="features",
                                     seed=seed)

    if model_name == 'OneVsRest':
        lr = LogisticRegression(labelCol="label",
                                featuresCol="features",
                                maxIter=50,
                                regParam=0.5)
        clf = OneVsRest(labelCol="label",
                        featuresCol="features",
                        predictionCol="prediction",
                        classifier=lr)

    if tuning_method is not None:
        model = hyperparameter_tuned_model(clf, train_df)
    else:
        model = clf.fit(train_df)

    predictions = model.transform(test_df)

    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    results_dict[layer_index] = evaluator.evaluate(predictions)
    return results_dict
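# A hypothetical call to downstream_ml_func above (a features_df with
# 'features' and 'label' columns is assumed; grid keys must be LinearSVC
# Param names):
results = downstream_ml_func(features_df,
                             results_dict={},
                             layer_index=0,
                             model_name='LinearSVC',
                             extra_config={'regParam': [0.01, 0.1], 'numFolds': 3},
                             tuning_method='CrossValidator')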
Example #23
rando_forest_model = rando_forest.fit(train)
rando_forest_preds = rando_forest_model.transform(validation)
custom_evaluation(rando_forest_preds, 'Random Forest')

# In[128]:

#Gradient-boosted trees
gbtrees = GBTClassifier(maxIter=10)
gbtree_model = gbtrees.fit(train)
gbtree_preds = gbtree_model.transform(validation)
custom_evaluation(gbtree_preds, 'Gradient Boosted Trees')

# In[129]:

#SVM
svm = LinearSVC(maxIter=10, regParam=0.1)
svm_model = svm.fit(train)
svm_preds = svm_model.transform(validation)
custom_evaluation(svm_preds, 'Support Vector Machine')

# In[130]:

#Logistic regression model
logReg = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = logReg.fit(train)
lr_preds = lrModel.transform(validation)
custom_evaluation(lr_preds, 'Logistic Regression')
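# `custom_evaluation` is not shown in this snippet; a hypothetical version,
# consistent with how it is called above, might be:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def custom_evaluation(preds, model_name):
    # area under ROC on the validation predictions
    auc = BinaryClassificationEvaluator().evaluate(preds)
    print(model_name + " - area under ROC: " + str(auc))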

# In[131]:

#Visual check for predictions
Example #24
teb_vectorAssembler = VectorAssembler(inputCols=[
    "Nouns", "Verbs", "Exclamations", "Question_Marks", "Interjections",
    "Ellipsis", "Capitals", "Passive_aggressive_count"
],
                                      outputCol='features')

contrast_based_transdf = cb_vectorAssembler.transform(contrast_based_features)
contrast_based_df = contrast_based_transdf.select(["features", "label"])
emotion_based_transdf = eb_vectorAssembler.transform(emotion_based_features)
emotion_based_df = emotion_based_transdf.select(["features", "label"])
text_expression_transdf = teb_vectorAssembler.transform(
    text_expression_based_features)
text_expression_based_df = text_expression_transdf.select(
    ["features", "label"])

svc = LinearSVC(maxIter=10, regParam=0.1)
df_list = [contrast_based_df, emotion_based_df, text_expression_based_df]
RMSEs = []
MAEs = []
FS = []
Accuracies = []
Precisions = []
Recalls = []
for item in range(len(df_list)):
    print("-----------------RDD: " + str(item) + " -----------------------")
    for i in range(1, 6):
        print("---------------------FOLD " + str(i) +
              "-----------------------------")
        train, test = df_list[item].randomSplit([0.8, 0.2])
        svcModel = svc.fit(train)
        preds = svcModel.transform(test)
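        # The snippet truncates here; a plausible per-fold evaluation that
        # fills the lists declared above (the metric choices are assumptions,
        # and MulticlassClassificationEvaluator is assumed imported at the top):
        acc = MulticlassClassificationEvaluator(metricName="accuracy").evaluate(preds)
        f1 = MulticlassClassificationEvaluator(metricName="f1").evaluate(preds)
        Accuracies.append(acc)
        FS.append(f1)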
Example #25
def linearSVC(trainingData,
              testData,
              maxIter,
              regParam,
              aggregationDepth,
              enableCrossValidator=False,
              featuresCol="features",
              labelCol="label",
              predictionCol="prediction",
              tol=1e-6,
              rawPredictionCol="rawPrediction",
              fitIntercept=True,
              standardization=False,
              threshold=0.0):

    print("\nInizio classificazione con LinearSVCClassifier")

    # Initialize the classifier with the input parameters (and the defaults)
    lsvc = LinearSVC(featuresCol=featuresCol,
                     labelCol=labelCol,
                     predictionCol=predictionCol,
                     maxIter=maxIter,
                     regParam=regParam,
                     tol=tol,
                     rawPredictionCol=rawPredictionCol,
                     fitIntercept=fitIntercept,
                     standardization=standardization,
                     threshold=threshold,
                     aggregationDepth=aggregationDepth)

    print("    -modello creato")

    validator = None
    # If cross-validation is enabled
    if enableCrossValidator:
        # Build the (empty) parameter grid
        paramGrid = ParamGridBuilder().build()

        # Initialize the evaluator
        evaluator = BinaryClassificationEvaluator()

        # Set up k-fold cross-validation, where estimator is the classifier to evaluate and numFolds is K
        crossVal = CrossValidator(estimator=lsvc,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=evaluator,
                                  numFolds=5)  # use 3+ folds in practice
        validator = crossVal
    else:
        validator = lsvc

    print("    -validator creato")

    training = trainingData.map(lambda x: (x[31], Vectors.dense(x[1:29]), x[
        30])).toDF(schema=['index', 'features', 'label']).orderBy('index')

    # Configure the ML pipeline; here it consists of a single stage, the validator.
    # tokenizer = Tokenizer(inputCol="features", outputCol="transactions")
    # hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures", numFeatures=29)

    pipeline = Pipeline(stages=[validator])

    model = pipeline.fit(training)

    print("    -modello addestrato con la pipeline (" + str(training.count()) +
          " elementi utilizzati come training)")

    test = testData.map(lambda x: (x[30], Vectors.dense(x[1:29]), x[31])).toDF(
        schema=['label', 'features', 'index']).orderBy('index')

    # prediction = predictions, label, index
    predictionsAndLabels = model.transform(test).rdd.map(lambda x:
                                                         (x[4], x[0], x[2]))

    print("    -" + str(predictionsAndLabels.count()) +
          " elementi predetti (" + str(test.count()) +
          " elementi usati come test)")

    return predictionsAndLabels
Example #26
                                         layers=layers,
                                         blockSize=128,
                                         seed=1234)
# train the model
model = trainer.fit(train)
# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

from pyspark.ml.classification import LinearSVC
# Load training data
training = spark.read.format("libsvm").load(
    "file:///usr/local/spark/data/mllib/sample_libsvm_data.txt")
lsvc = LinearSVC(maxIter=10, regParam=0.1)
# Fit the model
lsvcModel = lsvc.fit(training)
# Print the coefficients and intercept for linear SVC
print("Coefficients: " + str(lsvcModel.coefficients))
print("Intercept: " + str(lsvcModel.intercept))

from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# load data file.
inputData = spark.read.format("libsvm").load(
    "file:///usr/local/spark/data/mllib/sample_multiclass_classification_data.txt"
)
# generate the train/test split.
(train, test) = inputData.randomSplit([0.8, 0.2])
# instantiate the base classifier.
def SVM(trainingData, testData):
    start_time = time.time()
    print(" ")
    print("--------------------- SUPPORT VECTOR MACHINE ---------------------")

    svm = LinearSVC()
    ovr = OneVsRest(classifier=svm)

    # Parameters over which to tune
    paramGrid = ParamGridBuilder() \
        .addGrid(svm.regParam, [1, 0]) \
        .addGrid(svm.maxIter, [100, 1000]) \
        .build()

    # Tune over the grid to choose the best model
    tvs = TrainValidationSplit(estimator=ovr,
                               estimatorParamMaps=paramGrid,
                               evaluator=MulticlassClassificationEvaluator(),
                               # Validation split: 80% training, 20% validation.
                               trainRatio=0.8)

    model = tvs.fit(trainingData)

    prediction = model.transform(testData)

    result = prediction.select('features', 'label', 'prediction')

    # Compute accuracy
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(prediction)
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="f1")
    f1score = evaluator.evaluate(prediction)

    # Confusion Matrix
    class_temp = prediction.select("label").groupBy("label") \
        .count().sort('count', ascending=False).toPandas()
    class_temp = class_temp["label"].values.tolist()

    y_true = prediction.select("label")
    y_true = y_true.toPandas()

    y_pred = prediction.select("prediction")
    y_pred = y_pred.toPandas()

    cnf_matrix = confusion_matrix(y_true, y_pred, labels=class_temp)


    print("Accuracy Hold-Out: ", accuracy)
    print("F1-Score Hold-Out: ", f1score)
    print("")
    print("")
    print("Doc Parameters : [", model.explainParams(), "]")
    print("")
    print("")
    print("Confusion Matrix: ")
    print(cnf_matrix)
    print("SVM HoldOut Execution TIME:", time.time() - start_time)

    # Call SVMCV, the SVM variant that uses k-fold validation
    f1score_cv, cnf_matrix_cv, cv = SVMCV(trainingData, testData)

    # Return the better model between hold-out and k-fold
    if (f1score <= f1score_cv):
        return (f1score_cv, cnf_matrix_cv, cv)
    else:
        return (f1score, cnf_matrix, tvs)
Example #28
File: test.py  Project: Javi96/SGDI
#print("RandomForestClassifier parameters:\n" + rf.explainParams() + "\n")
model = rf.fit(final_train)
predictions = model.transform(final_test)
predictions.show()
accuracy = evaluator.evaluate(predictions)
print("RandomForestClassifier - Test set accuracy = " + str(accuracy))

gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)
#print("GBTClassifier parameters:\n" + gbt.explainParams() + "\n")
model = gbt.fit(final_train)
predictions = model.transform(final_test)
predictions.show()
accuracy = evaluator.evaluate(predictions)
print("GBTClassifier - Test set accuracy = " + str(accuracy))

lsvc = LinearSVC(maxIter=10, regParam=0.1)
#print("LinearSVC parameters:\n" + lsvc.explainParams() + "\n")
model = lsvc.fit(final_train)
predictions = model.transform(final_test)
predictions.show()
accuracy = evaluator.evaluate(predictions)
print("LinearSVC - Test set accuracy = " + str(accuracy))

nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
#print("NaiveBayes parameters:\n" + nb.explainParams() + "\n")
model = nb.fit(final_train)
predictions = model.transform(final_test)
predictions.show()
accuracy = evaluator.evaluate(predictions)
print("NaiveBayes - Test set accuracy = " + str(accuracy))
'''def cleanup_age():
Example #29
def lsvc(self, maxIter=10, regParam=0.1):
    self.time_calc.start_time('\nLinear Support Vector Machine')
    lsvc = LinearSVC(maxIter=maxIter, regParam=regParam)
    self.classify('lsvc', lsvc)
    self.time_calc.end_time('Linear Support Vector Machine')
Example #30
File: linearsvc.py  Project: 11wzy001/spark
from __future__ import print_function

# $example on$
from pyspark.ml.classification import LinearSVC
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("linearSVC Example")\
        .getOrCreate()

    # $example on$
    # Load training data
    training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    lsvc = LinearSVC(maxIter=10, regParam=0.1)

    # Fit the model
    lsvcModel = lsvc.fit(training)

    # Print the coefficients and intercept for linear SVC
    print("Coefficients: " + str(lsvcModel.coefficients))
    print("Intercept: " + str(lsvcModel.intercept))

    # $example off$

    spark.stop()
Example #31
    predictionAndLabels = result_MLP.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    accuracy_MLP = evaluator.evaluate(predictionAndLabels)
    print("Accuracy MLP = " + str(accuracy_MLP))

    file.write("\n" + "== Results on labeled data (Brexit) ==" + "\n")
    file.write('-> ACCURACY MLP : ' + str(accuracy_MLP) + '\n')

    print("\n======================================================= ")
    print("====================== LINEAR SVC ===================== ")
    print("=======================================================\n")

    print("\n================== Training ===================\n")

    # train the SVC model
    trainer_SVC = LinearSVC(maxIter=10, regParam=0.1)
    model_linear_svc = trainer_SVC.fit(rescaledData)
    print("Done : Linear_SVC training")

    print("\n=================== Testing =================== \n")

    #SVC test
    predictions_svc = model_linear_svc.transform(rescaled_test_df)
    #predictions_svc.show()

    num_pos_svc = predictions_svc.select("prediction").rdd.map(
        lambda x: x["prediction"]).countByValue()[1.0]
    num_neg_svc = predictions_svc.select("prediction").rdd.map(
        lambda x: x["prediction"]).countByValue()[0.0]

    print("\n== PREDICTION SVC : ==")