def main(train_x, train_y, test_x, test_y=None, idf=False, ngram=1, base='gs', asm=False):
    """Train a Naive Bayes classifier and score or predict the test set.

    Both data sets are loaded through ``elizabeth.load``, the string labels
    are indexed, features are built from n-gram counts (optionally followed
    by IDF) and a ``NaiveBayes`` model is fit on the training data.

    Args:
        train_x: Forwarded to ``elizabeth.load`` as the training data source.
        train_y: Forwarded to ``elizabeth.load`` as the training label source.
        test_x: Forwarded to ``elizabeth.load`` as the test data source.
        test_y: Test label source, or None when the test set has no labels.
        idf: When true, an ``IDF`` stage is added after the count vectorizer.
        ngram: N-gram size; converted with ``int`` before use.
        base: Forwarded to ``elizabeth.load`` as ``base``.
        asm: When true, load the 'asm' kind of data instead of 'bytes'.

    Output:
        With test labels, shows the mean of the per-row "prediction equals
        label" indicator. Without them, prints one predicted class per line,
        ordered by ``id``.
    """
    # Load : DF[id, url, features, label?]
    # The DataFrames only have a labels column if labels are given.
    # We drop the text, since Naive Bayes doesn't use it and we already have
    # all the tokens.
    kind = 'asm' if asm else 'bytes'
    train = elizabeth.load(train_x, train_y, base=base, kind=kind).drop('text')
    test = elizabeth.load(test_x, test_y, base=base, kind=kind).drop('text')

    # Convert the string labels to numeric indices.
    # The handleInvalid param allows the label indexer to deal with labels
    # that weren't seen during fitting.
    label_indexer = StringIndexer(
        inputCol='label', outputCol='indexedLabel', handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)

    # The test set won't always have labels.
    if test_y is not None:
        test = label_indexer.transform(test)

    # Maps numeric predictions back to the original label strings.
    index_labeller = IndexToString(
        inputCol='prediction', outputCol='predictedClass',
        labels=label_indexer.labels)

    # Train the preprocessor and transform the data.
    prep = elizabeth.Preprocessor()
    prep.add(NGram(n=int(ngram)))
    prep.add(CountVectorizer())
    if idf:
        prep.add(IDF())
    # NOTE(review): presumably Preprocessor.fit returns the transformed
    # training DataFrame - confirm against elizabeth.Preprocessor.
    train = prep.fit(train)
    test = prep.transform(test)

    # Naive Bayes :
    # DF[id, url, text, features, label?, rawPrediction, probability, prediction]
    nb = NaiveBayes(labelCol='indexedLabel').fit(train)
    test = nb.transform(test)
    test = index_labeller.transform(test)  # DF[id, url, ... prediction, predictedClass]

    # If labels are given for the test set, print a score.
    if test_y:
        test = test.orderBy(test.id)
        test = test.withColumn(
            'correct', (test.label == test.predictedClass).cast('double'))
        test = test.select(avg(test.correct))
        # show() prints the table itself and returns None, so it must not be
        # wrapped in print() (that would emit a stray "None" line).
        test.show()

    # If no labels are given for the test set, print predictions.
    else:
        test = test.orderBy(test.id).select(test.predictedClass)
        test = test.rdd.map(lambda row: int(row.predictedClass))
        # Stream rows to the driver instead of collecting them all at once.
        test = test.toLocalIterator()
        print(*test, sep='\n')
# Fit a Naive Bayes classifier with default parameters; `model` is rebound
# from the estimator to the fitted model.
model = NaiveBayes()
model = model.fit(train_data)

# # model evaluation

# In[ ]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# In[ ]:

# No metric name is passed, so the evaluator reports its default metric
# (f1 according to the PySpark documentation).
acc_eval = MulticlassClassificationEvaluator()

# In[ ]:

test_results = model.transform(test_data)

# In[ ]:

# NOTE(review): this drops every row whose prediction is not greater than 0
# before scoring, so those rows are excluded from the metric - confirm that
# this is intended.
test_results = test_results.filter(test_results['prediction'] > 0)

# In[ ]:

# Printed explicitly: a bare expression only displays its value inside a
# notebook cell and is silently discarded when this file runs as a script.
print(test_results.count())

# In[ ]:

print('F1')
print(acc_eval.evaluate(test_results))

# In[ ]:
train, test = splits[0], splits[1]

# Build the Naive Bayes model, train it and run the prediction. A timestamp
# is printed immediately before and after training.
now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)
nb = NaiveBayes(
    labelCol='attack_cat_index',
    featuresCol='features',
    predictionCol='prediction',
)
nb = nb.fit(train)
now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)
result = nb.transform(test)

# Helper for the true positive rate (TPR).
# The predictions are first pulled to the driver as a list of
# [attack_cat_index, prediction] pairs.
prediction_list = result.select("attack_cat_index", "prediction").toPandas()
prediction_list = prediction_list[["attack_cat_index", "prediction"]].values.tolist()


def truePositiveRate(list, label):
    """Tally rows matching *label* in their first and second elements.

    ``tot_count`` counts rows whose first element equals *label*;
    ``true_count`` counts those of them whose second element equals it too.
    """
    # NOTE(review): the visible code ends after the counting loop without a
    # return statement, so as written this function returns None. The nesting
    # of the second test was reconstructed from flattened source - confirm.
    tot_count = 0
    true_count = 0
    for a in list:
        if a[0] != label:
            continue
        tot_count += 1
        if a[1] == label:
            true_count += 1
# Combine the indexed categorical columns and the numeric columns into a
# single "features" vector column.
assemblerInputs = indexedCategoricalCols + numericColList
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
df = assembler.transform(df)

# Indexing binary labels
labeller = StringIndexer(inputCol=label, outputCol="label").fit(df)
df = labeller.transform(df).select(["features", "label"])

### Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=100)

# --- Logistic regression ---
#dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dt = LogisticRegression(regParam=0.01)
model = dt.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

evaluator = Evaluator()

# Select example rows to display.
predictions.select("prediction", "label", "features").show()

# Evaluate the learned model
print("LogRegression Test %s: %f" % (evaluator.getMetricName(), evaluator.evaluate(predictions)))

# --- Naive Bayes ---
# Fit the NaiveBayes estimator itself. Previously the estimator was created
# and then immediately overwritten by `dt.fit(...)`, so the "Bayes" figures
# were just the logistic regression results a second time.
model = NaiveBayes(thresholds=[0.1, 1.0]).fit(trainingData)
predictions = model.transform(testData)
predictions.select("prediction", "label", "features").show()
print("Bayes Test %s: %f" % (evaluator.getMetricName(), evaluator.evaluate(predictions)))