def parse_dataframe(json_data):
    # Convert a JSON payload into a Spark DataFrame: one JSON object per line
    # is parallelized into an RDD and read with sqlContext.jsonRDD.
    print(json_data)
    r = convert_single_object_per_line(json_data)
    mylist = []
    for line in r.splitlines():
        mylist.append(line)
    rdd = sc.parallelize(mylist)
    df = sqlContext.jsonRDD(rdd)
    return df
def getLabelsAndPredictions(best_result_lines, destination_file, used_dataset):
    datas = SetsCreation.setsCreation(1, used_dataset)
    (trainingData, testData) = datas[0]
    labelsAndPredictions = {}

    with open(str(Path(__file__).parent) + "/CSV_Results/" + destination_file + ".csv", "w") as ensemble_metric:
        csvWriter = csv.writer(ensemble_metric)
        csvWriter.writerow(['EnsembleType', 'Sensitivity', 'Fallout', 'Specificity', 'Miss_Rate', 'Test_Err', 'AUC'])

        for i in range(len(best_result_lines)):
            row = best_result_lines[i]
            parameters = row[2]

            if i == 0:
                labelsAndPredictions.update({row[0]: DecisionTree.decisionTree(trainingData, testData,
                                                                               parameters[0],
                                                                               int(parameters[1]),
                                                                               int(parameters[2]),
                                                                               True).collect()})
                print("1/5")
            elif i == 1:
                labelsAndPredictions.update({row[0]: RandomForest.randomForest(trainingData, testData,
                                                                               parameters[0],
                                                                               int(parameters[1]),
                                                                               int(parameters[2]),
                                                                               int(parameters[3]),
                                                                               True).collect()})
                print("2/5")
            elif i == 2:
                # Gradient Boosted Tree is disabled:
                # labelsAndPredictions.update({row[0]: GradientBoostedTree.gradientBoostedTrees(trainingData, testData,
                #                                                                               int(parameters[2]),
                #                                                                               int(parameters[0]),
                #                                                                               int(parameters[1]),
                #                                                                               True).collect()})
                print("3/5 SKIPPED (GBT)")
            elif i == 3:
                labelsAndPredictions.update({row[0]: LogisticRegression.logisticRegression(trainingData, testData,
                                                                                            int(parameters[0]),
                                                                                            float(parameters[1]),
                                                                                            float(parameters[2]),
                                                                                            int(parameters[3]),
                                                                                            True).collect()})
                print("4/5")
            elif i == 4:
                labelsAndPredictions.update({row[0]: LinearSVC.linearSVC(trainingData, testData,
                                                                         int(parameters[0]),
                                                                         float(parameters[1]),
                                                                         int(parameters[2]),
                                                                         True).collect()})
                print("5/5")

        for key in list(labelsAndPredictions.keys()):
            result = ma.metricsEvalutation(sc.parallelize(labelsAndPredictions[key]),
                                           len(labelsAndPredictions[key]), False)

            classifier_name = ""
            if key == "Decision_Tree":
                classifier_name = "DT"
            elif key == "Random_Forest":
                classifier_name = "RF"
            elif key == "Gradient_Boosted_Tree":
                print("GBT SKIPPED")
                continue
            elif key == "Logistic_Regression":
                classifier_name = "LR"
            elif key == "Linear_SVC":
                classifier_name = "LSVC"

            csvWriter.writerow([classifier_name,
                                str(result.sensitivity), str(result.fallout), str(result.specificity),
                                str(result.missRate), str(result.testErr), str(result.AUC)])

    return labelsAndPredictions
if x.isdigit():
    print("before")
    print(x)
    if len(customerlist) == 0 or customerlist[len(customerlist) - 1] != x:
        if len(customerlist) != 0:
            # note: `list` here is a variable defined elsewhere that shadows the built-in list
            list.append(Row(sequence=anotherList))
        customerlist.append(x)
        sequence = []
        anotherList = []
if x != "" and not x.isdigit():
    list2.append(x)
    anotherList.append(list2)

list.append(Row(sequence=anotherList))
print(list)

df = sc.parallelize(list).toDF()

prefixSpan = PrefixSpan(minSupport=0.1, maxPatternLength=3, maxLocalProjDBSize=32000000)

# Find frequent sequential patterns.
prefixSpan.findFrequentSequentialPatterns(df).sort(desc("freq")).show(10, False)
def ensembler(predALab, destination_file):
    ensemblePair = {}
    print("Running pair ensembles")
    ensemblePair.update({'DT RF': majorityVotePairs(predALab['Decision_Tree'], predALab['Random_Forest'])})
    ensemblePair.update({'DT LR': majorityVotePairs(predALab['Decision_Tree'], predALab['Logistic_Regression'])})
    ensemblePair.update({'DT LSVC': majorityVotePairs(predALab['Decision_Tree'], predALab['Linear_SVC'])})
    ensemblePair.update({'RF LR': majorityVotePairs(predALab['Random_Forest'], predALab['Logistic_Regression'])})
    ensemblePair.update({'RF LSVC': majorityVotePairs(predALab['Random_Forest'], predALab['Linear_SVC'])})
    ensemblePair.update({'LR LSVC': majorityVotePairs(predALab['Logistic_Regression'], predALab['Linear_SVC'])})
    # Pairs involving Gradient_Boosted_Tree ('DT GBT', 'RF GBT', 'GBT LR', 'GBT LSVC') are skipped (GBT disabled).
    print("Pair ensembles done")

    ensembleTriple = {}
    print("Running triple ensembles")
    ensembleTriple.update({'DT RF LR': majorityVoteTriple(predALab['Decision_Tree'], predALab['Random_Forest'],
                                                          predALab['Logistic_Regression'])})
    ensembleTriple.update({'DT RF LSVC': majorityVoteTriple(predALab['Decision_Tree'], predALab['Random_Forest'],
                                                            predALab['Linear_SVC'])})
    ensembleTriple.update({'DT LR LSVC': majorityVoteTriple(predALab['Decision_Tree'], predALab['Logistic_Regression'],
                                                            predALab['Linear_SVC'])})
    ensembleTriple.update({'RF LR LSVC': majorityVoteTriple(predALab['Random_Forest'], predALab['Logistic_Regression'],
                                                            predALab['Linear_SVC'])})
    # Triples involving Gradient_Boosted_Tree ('DT RF GBT', 'DT GBT LR', 'DT GBT LSVC',
    # 'RF GBT LR', 'RF GBT LSVC', 'GBT LR LSVC') are skipped (GBT disabled).
    print("Triple ensembles done")

    ensembleQuadruple = {}
    print("Running quadruple ensembles")
    ensembleQuadruple.update({'DT RF LR LSVC': majorityVoteQuadruple(predALab['Decision_Tree'],
                                                                     predALab['Random_Forest'],
                                                                     predALab['Logistic_Regression'],
                                                                     predALab['Linear_SVC'])})
    # Quadruples involving Gradient_Boosted_Tree ('DT RF GBT LR', 'DT RF GBT LSVC',
    # 'DT GBT LR LSVC', 'RF GBT LR LSVC') are skipped (GBT disabled).
    print("Quadruple ensembles done")

    # The quintuple ensemble ('DT RF GBT LR LSVC' via majorityVoteQuintuple) is skipped because it requires GBT.
    ensembleQuintuple = {}

    result = {}
    for key, value in ensemblePair.items():
        result.update({key: ma.metricsEvalutation(sc.parallelize(value), len(value), False)})
    for key, value in ensembleTriple.items():
        result.update({key: ma.metricsEvalutation(sc.parallelize(value), len(value), False)})
    for key, value in ensembleQuadruple.items():
        result.update({key: ma.metricsEvalutation(sc.parallelize(value), len(value), False)})

    with open(str(Path(__file__).parent) + "/CSV_Results/" + destination_file + ".csv", "a") as ensemble_metric:
        csvWriter = csv.writer(ensemble_metric)
        # csvWriter.writerow(['EnsembleType', 'Sensitivity', 'Fallout', 'Specificity', 'Miss_Rate', 'Test_Err', 'AUC'])
        for key, value in result.items():
            csvWriter.writerow([key,
                                str(value.sensitivity), str(value.fallout), str(value.specificity),
                                str(value.missRate), str(value.testErr), str(value.AUC)])

        csvWriter.writerow(["##################"])
        csvWriter.writerow(['DT = Decision_Tree'])
        csvWriter.writerow(['RF = Random_Forest'])
        # csvWriter.writerow(['GBT = Gradient_Boosted_Tree'])
        csvWriter.writerow(['LR = Logistic_Regression'])
        csvWriter.writerow(['LSVC = Linear_SVC'])
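# majorityVotePairs, majorityVoteTriple, majorityVoteQuadruple and majorityVoteQuintuple
# are project helpers defined elsewhere and are not shown here. As a rough, hypothetical
# sketch of the idea (assuming each predALab[...] entry is a list of (label, prediction)
# pairs over the same test rows, with binary 0.0/1.0 predictions), a triple majority vote
# could look like this:
def majority_vote_triple_sketch(a, b, c):
    voted = []
    for (label, p1), (_, p2), (_, p3) in zip(a, b, c):
        # Keep the label and take the class predicted by at least two of the three models.
        prediction = 1.0 if (p1 + p2 + p3) >= 2.0 else 0.0
        voted.append((label, prediction))
    return voted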
import pickle
import sys
import time

import numpy as np
from numpy.linalg import norm
from pyspark.shell import sc
import matplotlib.pyplot as plt


def parse_data(row):
    '''
    Parse each pandas row into a tuple of (station_name, feature_vec),
    where feature_vec is the concatenation of the projection vectors
    of TAVG, TRANGE, and SNWD.
    '''
    return (row[0], np.concatenate([row[1], row[2], row[3]]))


## Read data
data = pickle.load(open("stations_projections.pickle", "rb"))
rdd = sc.parallelize([parse_data(row[1]) for row in data.iterrows()])
rdd.take(1)

# Number of centroids
K = 5

# Number of K-means runs that are executed in parallel.
# Equivalently, the number of sets of initial points.
RUNS = 25

# For reproducibility of results
RANDOM_SEED = 60295531

# The K-means algorithm is terminated when the change in the
# location of the centroids is smaller than 0.1
converge_dist = 0.1
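# The constants above configure a K-means run whose driver loop is not shown in this
# snippet. The sketch below is a minimal, single-run illustration of such a loop under
# the same settings; the helper names (closest_center, kmeans_single_run) and the
# single-run simplification are assumptions, not part of the original code.
import numpy as np


def closest_center(point, centers):
    # Index of the centroid nearest to `point` (Euclidean distance).
    return int(np.argmin([np.linalg.norm(point - c) for c in centers]))


def kmeans_single_run(rdd, k=K, seed=RANDOM_SEED, tol=converge_dist):
    # rdd holds (station_name, feature_vec) pairs; cluster on the vectors only.
    points = rdd.map(lambda kv: kv[1]).cache()
    centers = [np.array(v) for v in points.takeSample(False, k, seed)]
    moved = float('inf')
    while moved > tol:
        # Assign each point to its nearest centroid and average per cluster.
        means = (points.map(lambda p: (closest_center(p, centers), (p, 1)))
                       .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
                       .mapValues(lambda s: s[0] / s[1])
                       .collectAsMap())
        new_centers = [means.get(i, centers[i]) for i in range(k)]
        moved = sum(np.linalg.norm(new_centers[i] - centers[i]) for i in range(k))
        centers = new_centers
    return centers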
from pyspark import SparkContext, SparkConf
from pyspark.shell import sc

data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)
distData.collect()

# data = [1, 2, 3, 4, 5]
# distData = sc.parallelize(data)
# print(distData.reduce(lambda a, b: a + b))

# csv_data = sc.csv("file:///home/hpinto/Desktop/mycloud/DimCurrency.csv")
# csv_data = csv_data.map(lambda p: p.split(","))
# df_csv = csv_data.map(lambda p: Row(CurrencyKey= p[0],CurrencyAlternateKey = p[1],CurrencyName = p[2])).toDF()
# df_csv.write.format("orc").saveAsTable("employeesZZZ")
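# The commented-out block above appears to read DimCurrency.csv into a DataFrame and
# save it as an ORC table. A rough sketch of that idea follows; the file path and column
# layout are taken from the comments and are assumptions here. Note that SparkContext has
# no csv() method, so textFile() is used instead.
from pyspark.sql import Row
from pyspark.shell import sc

csv_data = sc.textFile("file:///home/hpinto/Desktop/mycloud/DimCurrency.csv")
df_csv = csv_data.map(lambda p: p.split(",")) \
    .map(lambda p: Row(CurrencyKey=p[0],
                       CurrencyAlternateKey=p[1],
                       CurrencyName=p[2])).toDF()
df_csv.write.format("orc").saveAsTable("employeesZZZ")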
from pyspark import RDD
from pyspark.shell import sc


def tvars(prefix: str, fr: int, to: int) -> RDD:
    # Build (index, "<prefix><index>") pairs for the inclusive range [fr, to].
    data = range(fr, to + 1)
    distData: RDD = sc.parallelize(data)
    return distData \
        .map(lambda x: (x, f"{prefix}{x}")) \
        .sortBy(lambda x: x)
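# Illustrative call (not in the original): the pairs come back sorted by the tuple itself.
print(tvars("t", 1, 3).collect())  # [(1, 't1'), (2, 't2'), (3, 't3')]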
# -*- coding:utf-8 -*-
from pyspark.ml.stat import Summarizer
from pyspark.shell import sc
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

df = sc.parallelize([
    Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
    Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))
]).toDF()

# create summarizer for multiple metrics "mean" and "count"
summarizer = Summarizer.metrics("mean", "count")

# compute statistics for multiple metrics with weight
df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)

# compute statistics for multiple metrics without weight
df.select(summarizer.summary(df.features)).show(truncate=False)

# compute statistics for single metric "mean" with weight
df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)

# compute statistics for single metric "mean" without weight
df.select(Summarizer.mean(df.features)).show(truncate=False)
import csv

import pyspark
import pyspark.sql.types as typ
from pyspark.shell import sc, sqlContext

# reading csv data from file
rows = []
with open("records.csv", 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        rows.append(row)

rdd: pyspark.rdd.RDD = sc.parallelize(rows)

"""
row[0] = TimeStamp
row[1] = EventType
row[2] = Private IP
row[3] = Private Port
row[6] = Destination IP
row[7] = Destination Port
"""

# Group events by connection tuple, keep connections with exactly two events, and emit
# (start timestamp, end timestamp, private IP, private port, destination IP, destination port).
reduced = rdd.map(lambda row: ((row[2], row[3], row[6], row[7]), [(row[1], row[0])])) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda row: (row[0], sorted(row[1], key=lambda text: text[0]))).filter(lambda row: len(row[1]) == 2) \
    .map(lambda row: (row[1][0][1], row[1][1][1], row[0][0], row[0][1], row[0][2], row[0][3]))

schema_red = typ.StructType([
    typ.StructField('Start Date', typ.StringType(), False),
    typ.StructField('End Date', typ.StringType(), False),
    # Remaining fields (assumed from the tuple layout built above):
    typ.StructField('Private IP', typ.StringType(), False),
    typ.StructField('Private Port', typ.StringType(), False),
    typ.StructField('Destination IP', typ.StringType(), False),
    typ.StructField('Destination Port', typ.StringType(), False)
])
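# Illustrative continuation (not in the source): as in the analogous snippet below,
# `reduced` can then be turned into a DataFrame with the schema above.
df_red = sqlContext.createDataFrame(reduced, schema_red)
df_red.show()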
from pyspark.sql import Row
from pyspark.ml.fpm import PrefixSpan
from pyspark.shell import sc

df = sc.parallelize([Row(sequence=[[1, 2], [3]]),
                     Row(sequence=[[1], [3, 2], [1, 2]]),
                     Row(sequence=[[1, 2], [5]]),
                     Row(sequence=[[6]])]).toDF()

prefixSpan = PrefixSpan(minSupport=0.5, maxPatternLength=5,
                        maxLocalProjDBSize=32000000)

# Find frequent sequential patterns.
prefixSpan.findFrequentSequentialPatterns(df).show()
import pyspark
import pyspark.sql.types as typ
from pyspark.shell import sc, sqlContext

rdd: pyspark.rdd.RDD = sc.parallelize([('xxx', 'yyy', 'zzz', 'C', 22),
                                       ('xxx', 'yyy', 'zzz', 'D', 23),
                                       ('xxx1', 'yyy1', 'zzz1', 'C', 24),
                                       ('xxx1', 'yyy1', 'zzz1', 'D', 25)])

reduced = rdd.map(lambda row: ((row[0], row[1], row[2]), [(row[3], row[4])])) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda row: (row[0], sorted(row[1], key=lambda text: text[0]))) \
    .map(lambda row: (row[0][0], row[0][1], row[0][2],
                      ','.join([str(e[0]) for e in row[1]]),
                      row[1][0][1], row[1][1][1])) \
    .filter(lambda row: row[3] == "C,D")

schema_red = typ.StructType([
    typ.StructField('Data1', typ.StringType(), False),
    typ.StructField('Data2', typ.StringType(), False),
    typ.StructField('Data3', typ.StringType(), False),
    typ.StructField('Type', typ.StringType(), False),
    # The date columns carry the integer values from the source tuples, so they use LongType.
    typ.StructField('Start Date', typ.LongType(), False),
    typ.StructField('End Date', typ.LongType(), False)
])

df_red = sqlContext.createDataFrame(reduced, schema_red)
df_red.show()

# Explanation ####################################################################################################
# The same pipeline restated as one chained expression; the tail of this chain is assumed
# to mirror the `reduced` pipeline above.
result = rdd.map(lambda row: ((row[0], row[1], row[2]), [(row[3], row[4])])) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda row: (row[0], sorted(row[1], key=lambda text: text[0]))) \
    .map(lambda row: (row[0][0], row[0][1], row[0][2],
                      ','.join([str(e[0]) for e in row[1]]),
                      row[1][0][1], row[1][1][1])) \
    .filter(lambda row: row[3] == "C,D")
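# Step-by-step view of the chain above (illustrative; the intermediate variable names are
# not in the original). The comments show what each stage produces for the sample RDD.
keyed = rdd.map(lambda row: ((row[0], row[1], row[2]), [(row[3], row[4])]))
# [(('xxx', 'yyy', 'zzz'), [('C', 22)]), (('xxx', 'yyy', 'zzz'), [('D', 23)]), ...]

grouped = keyed.reduceByKey(lambda x, y: x + y)
# [(('xxx', 'yyy', 'zzz'), [('C', 22), ('D', 23)]),
#  (('xxx1', 'yyy1', 'zzz1'), [('C', 24), ('D', 25)])]   (list order may vary)

ordered = grouped.map(lambda row: (row[0], sorted(row[1], key=lambda text: text[0])))
# Same pairs, with each value list sorted by type ('C' before 'D').

flattened = ordered.map(lambda row: (row[0][0], row[0][1], row[0][2],
                                     ','.join([str(e[0]) for e in row[1]]),
                                     row[1][0][1], row[1][1][1]))
# [('xxx', 'yyy', 'zzz', 'C,D', 22, 23), ('xxx1', 'yyy1', 'zzz1', 'C,D', 24, 25)]

complete = flattened.filter(lambda row: row[3] == "C,D")  # keep keys that have both a 'C' and a 'D' event
print(complete.collect())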
from pyspark import SparkConf, SparkContext
from pyspark.shell import sc

accum = sc.accumulator(0)
print(accum)

sc.parallelize([1, 2, 3, 4]).foreach(lambda x: accum.add(x))
print(accum.value)

# Execution model: the accumulator is read-only on the driver; every executor only
# writes to it (add, add, add, ...), and the combined total is returned to the driver.
import random

from pyspark.shell import sc


def inside(p):
    # Sample a random point in the unit square and test whether it falls
    # inside the quarter circle of radius 1.
    x, y = random.random(), random.random()
    return x * x + y * y < 1


count = sc.parallelize(range(0, 100000000)) \
    .filter(inside).count()
print("Pi is roughly %f" % (4.0 * count / 100000000))