Python NaiveBayes.transform 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: pyspark.ml.classification

클래스/타입: NaiveBayes

메소드/함수: transform

hotexamples.com에서의 예제들: 4

Python NaiveBayes.transform - 4개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 pyspark.ml.classification.NaiveBayes.transform에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

NaiveBayes(30)

fit(30)

transform(4)

train(3)

explainParams(2)

setLabelCol(2)

setPredictionCol(2)

getModelType(1)

getSmoothing(1)

load(1)

save(1)

setFeaturesCol(1)

setThresholds(1)

write(1)

예제 #1

파일 보기

파일: naive_bayes.py 프로젝트: zachdj/elizabeth

def main(train_x,
         train_y,
         test_x,
         test_y=None,
         idf=False,
         ngram=1,
         base='gs',
         asm=False):
    """Fit a Naive Bayes model on the training set and apply it to the test set.

    When ``test_y`` is given, the fraction of test rows whose predicted class
    equals the true label is shown. Otherwise one predicted class per line is
    printed, ordered by document id.

    Args:
        train_x: Passed to ``elizabeth.load`` as the training data source.
        train_y: Passed to ``elizabeth.load`` as the training label source.
        test_x: Passed to ``elizabeth.load`` as the test data source.
        test_y: Test label source, or ``None`` when the test set is unlabelled.
        idf: If true, an ``IDF`` stage is appended to the preprocessor.
        ngram: N-gram size; converted with ``int`` before use.
        base: Forwarded to ``elizabeth.load`` as ``base``.
        asm: If true, load the ``'asm'`` kind of data instead of ``'bytes'``.
    """
    # Load : DF[id, url, features, label?]
    # The DataFrames only have a labels column if labels are given.
    # We drop the text, since Naive Bayes doesn't use it and we already have
    # all the tokens.
    kind = 'asm' if asm else 'bytes'
    train = elizabeth.load(train_x, train_y, base=base, kind=kind).drop('text')
    test = elizabeth.load(test_x, test_y, base=base, kind=kind).drop('text')

    # A single check decides both whether the test labels get indexed and
    # whether a score is reported, so the two steps can never disagree.
    has_test_labels = test_y is not None

    # Convert the string labels to numeric indices.
    # handleInvalid="skip" lets the indexer deal with labels that were not
    # seen during fitting.
    label_indexer = StringIndexer(inputCol='label',
                                  outputCol='indexedLabel',
                                  handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)
    # The test set won't always have labels.
    if has_test_labels:
        test = label_indexer.transform(test)

    # Maps numeric predictions back to the original label strings.
    index_labeller = IndexToString(inputCol='prediction',
                                   outputCol='predictedClass',
                                   labels=label_indexer.labels)

    # Train the preprocessor and transform the data.
    prep = elizabeth.Preprocessor()
    prep.add(NGram(n=int(ngram)))
    prep.add(CountVectorizer())
    if idf:
        prep.add(IDF())
    # NOTE(review): the result of prep.fit is used as the training DataFrame
    # below - confirm Preprocessor.fit returns the transformed data.
    train = prep.fit(train)
    test = prep.transform(test)

    # Naive Bayes : DF[id, url, text, features, label?, rawPrediction, probability, prediction]
    nb = NaiveBayes(labelCol='indexedLabel').fit(train)
    test = nb.transform(test)
    test = index_labeller.transform(
        test)  # DF[id, url, ... prediction, predictedClass]

    # If labels are given for the test set, print a score.
    if has_test_labels:
        test = test.orderBy(test.id)
        test = test.withColumn(
            'correct', (test.label == test.predictedClass).cast('double'))
        test = test.select(avg(test.correct))
        # show() prints the table itself and returns None, so it must not be
        # wrapped in print() (that would emit a stray "None" line).
        test.show()

    # If no labels are given for the test set, print predictions.
    else:
        test = test.orderBy(test.id).select(test.predictedClass)
        test = test.rdd.map(lambda row: int(row.predictedClass))
        test = test.toLocalIterator()
        print(*test, sep='\n')

예제 #2

파일 보기

파일: product_prediction.py.py 프로젝트: ShreyasGithub/pyspark_ccf

# Fit a Naive Bayes classifier, built with default parameters, on the
# training set. The name `model` is rebound from the estimator to the fitted
# model.
# NOTE(review): train_data / test_data are defined outside this excerpt.
model = NaiveBayes()
model = model.fit(train_data)

# # model evaluation

# In[ ]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# In[ ]:

# Evaluator constructed with its default settings.
# NOTE(review): the 'F1' label printed below presumably relies on the
# evaluator's default metric - confirm against the pyspark version in use.
acc_eval = MulticlassClassificationEvaluator()

# In[ ]:

# Apply the fitted model to the held-out data.
test_results = model.transform(test_data)

# In[ ]:

# Keep only rows whose prediction is greater than 0; rows predicted as 0 are
# excluded from the count and the score below.
# NOTE(review): confirm this exclusion is intentional, since it changes what
# the reported metric measures.
test_results = test_results.filter(test_results['prediction'] > 0)

# In[ ]:

# The count is not stored; presumably it was displayed as notebook cell output.
test_results.count()

# In[ ]:

# The metric value is not stored or printed here; presumably it was displayed
# as notebook cell output, so only the 'F1' label appears when run as a script.
print('F1')
acc_eval.evaluate(test_results)

# In[ ]:

예제 #3

파일 보기

파일: NaiveBayes_2.4.py 프로젝트: H1j4ck3d/TFG

# Use the first split for training and the second for testing.
# NOTE(review): `splits` is defined outside this excerpt.
train = splits[0]
test = splits[1]

# Create the Naive Bayes model, train it and run the prediction.
# The current date and time are printed before and after fitting so the two
# outputs bracket the training step.
now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

nb = NaiveBayes(labelCol='attack_cat_index',
                featuresCol='features',
                predictionCol='prediction')
# `nb` is rebound from the estimator to the fitted model.
nb = nb.fit(train)

now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

result = nb.transform(test)

# Build the input for the true-positive-rate (TPR) function: a list of
# [attack_cat_index, prediction] pairs collected through pandas.
prediction_list = result.select("attack_cat_index", "prediction").toPandas()[[
    "attack_cat_index", "prediction"
]].values.tolist()


def truePositiveRate(list, label):
    """Count the rows for ``label`` and how many of them were predicted as it.

    Each element of ``list`` is indexed as ``a[0]`` and ``a[1]``; an element
    is counted when ``a[0] == label``, and counted as a hit when ``a[1]``
    also equals ``label``.

    NOTE(review): the excerpt ends after the counting loop and no return
    statement is visible, so as shown the function returns None. The ratio
    computation is presumably cut off - confirm against the full file.
    NOTE(review): the parameter name ``list`` shadows the builtin; it is left
    unchanged because callers may pass it by keyword.
    """
    tot_count = 0  # elements whose first item equals label
    true_count = 0  # of those, elements whose second item also equals label
    for a in list:
        if a[0] == label:
            tot_count = tot_count + 1
            if a[1] == label:
                true_count = true_count + 1

예제 #4

파일 보기

# Assemble the indexed categorical columns and the numeric columns into a
# single "features" vector column.
assemblerInputs = indexedCategoricalCols + numericColList
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
df = assembler.transform(df)

# Indexing binary labels
labeller = StringIndexer(inputCol=label, outputCol="label").fit(df)
df = labeller.transform(df).select(["features", "label"])

### Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=100)

# Baseline: logistic regression.
dt = LogisticRegression(regParam=0.01)
model = dt.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)
evaluator = Evaluator()
# Select example rows to display.
predictions.select("prediction", "label", "features").show()
# Evaluate the learned model
print("LogRegression Test %s: %f" %
      (evaluator.getMetricName(), evaluator.evaluate(predictions)))

# Naive Bayes: fit the NaiveBayes estimator itself. Calling dt.fit here would
# refit the logistic regression and report its score under the "Bayes" label.
model = NaiveBayes(thresholds=[0.1, 1.0]).fit(trainingData)
predictions = model.transform(testData)
predictions.select("prediction", "label", "features").show()

print("Bayes Test %s: %f" %
      (evaluator.getMetricName(), evaluator.evaluate(predictions)))