Example #1
def parse_dataframe(json_data):
    """Convert a JSON string into a Spark DataFrame.

    convert_single_object_per_line, sc and sqlContext are assumed to be
    defined elsewhere; jsonRDD is the legacy Spark 1.x SQLContext API.
    """
    r = convert_single_object_per_line(json_data)
    mylist = r.splitlines()
    rdd = sc.parallelize(mylist)
    df = sqlContext.jsonRDD(rdd)
    return df
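
On Spark 2.x and later, SQLContext.jsonRDD no longer exists; a minimal sketch of the same conversion through a SparkSession (assuming a session named spark and the same convert_single_object_per_line helper):

def parse_dataframe_v2(spark, json_data):
    # spark.read.json also accepts an RDD of JSON strings, one object per element
    r = convert_single_object_per_line(json_data)
    rdd = spark.sparkContext.parallelize(r.splitlines())
    return spark.read.json(rdd)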
Example #2
import csv
from pathlib import Path

# SetsCreation, DecisionTree, RandomForest, LogisticRegression, LinearSVC,
# ma (the metrics module) and sc are assumed to be project-level imports.


def getLabelsAndPredictions(best_result_lines, destination_file, used_dataset):
    datas = SetsCreation.setsCreation(1, used_dataset)

    (trainingData, testData) = datas[0]
    labelsAndPredictions = {}

    with open(str(Path(__file__).parent) + "/CSV_Results/" + destination_file + ".csv", "w") as ensemble_metric:
        csvWriter = csv.writer(ensemble_metric)

        csvWriter.writerow(['EnsembleType', 'Sensitivity', 'Fallout', 'Specificity', 'Miss_Rate', 'Test_Err', 'AUC'])

        for i, row in enumerate(best_result_lines):
            parameters = row[2]
            if i == 0:  # `is` compares identity, not value; use == for ints
                labelsAndPredictions.update({row[0]: DecisionTree.decisionTree(trainingData,
                                                                               testData,
                                                                               parameters[0],
                                                                               int(parameters[1]),
                                                                               int(parameters[2]), True).collect()})
                print("1/5")
            elif i == 1:
                labelsAndPredictions.update({row[0]: RandomForest.randomForest(trainingData,
                                                                               testData,
                                                                               parameters[0],
                                                                               int(parameters[1]),
                                                                               int(parameters[2]),
                                                                               int(parameters[3]),
                                                                               True).collect()})
                print("2/5")
            elif i == 2:
                #labelsAndPredictions.update({row[0]: GradientBoostedTree.gradientBoostedTrees(trainingData,
                #                                                                              testData,
                #                                                                              int(parameters[2]),
                #                                                                              int(parameters[0]),
                #                                                                              int(parameters[1]),
                #                                                                              True).collect()})
                print("3/5 SKIPPED (GBT)")
            elif i == 3:
                labelsAndPredictions.update({row[0]: LogisticRegression.logisticRegression(trainingData,
                                                                                           testData,
                                                                                           int(parameters[0]),
                                                                                           float(parameters[1]),
                                                                                           float(parameters[2]),
                                                                                           int(parameters[3]),
                                                                                           True).collect()})
                print("4/5")
            elif i == 4:
                labelsAndPredictions.update({row[0]: LinearSVC.linearSVC(trainingData,
                                                                         testData,
                                                                         int(parameters[0]),
                                                                         float(parameters[1]),
                                                                         int(parameters[2]),
                                                                         True).collect()})
                print("5/5")

        for key in list(labelsAndPredictions.keys()):
            result = ma.metricsEvalutation(sc.parallelize(labelsAndPredictions[key]), len(labelsAndPredictions[key]), False)
            classifier_name = ""

            if key == "Decision_Tree":
                classifier_name = "DT"
            elif key == "Random_Forest":
                classifier_name = "RF"
            elif key == "Gradient_Boosted_Tree":
                classifier_name = "GBT"
                print("GBT SKIPPED")
                continue
            elif key == "Logistic_Regression":
                classifier_name = "LR"
            elif key == "Linear_SVC":
                classifier_name = "LSVC"

            csvWriter.writerow([classifier_name, str(result.sensitivity), str(result.fallout),
                                str(result.specificity), str(result.missRate),
                                str(result.testErr), str(result.AUC)])

    return labelsAndPredictions
Example #3
            if x.isdigit():
                print("before")
                print(x)
                if len(customerlist) == 0 or customerlist[len(customerlist) -
                                                          1] != x:

                    if len(customerlist) != 0:
                        list.append(Row(sequence=anotherList))

                    customerlist.append(x)
                    sequence = []
                    anotherList = []

            if x != "" and not x.isdigit():

                list2.append(x)

        anotherList.append(list2)
list.append(Row(sequence=anotherList))
print(list)

df = sc.parallelize(list).toDF()

prefixSpan = PrefixSpan(minSupport=0.1,
                        maxPatternLength=3,
                        maxLocalProjDBSize=32000000)

# Find frequent sequential patterns.
prefixSpan.findFrequentSequentialPatterns(df).sort(desc("freq")).show(
    10, False)
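
PrefixSpan reads the sequence column as an array of itemsets, i.e. an array of arrays; a minimal standalone input with hypothetical string items would look like:

df = sc.parallelize([Row(sequence=[["bread", "milk"], ["eggs"]]),
                     Row(sequence=[["bread"], ["milk"]])]).toDF()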
Example #4
import csv
from pathlib import Path

# majorityVotePairs/Triple/Quadruple/Quintuple, ma (the metrics module) and sc
# are assumed to be project-level imports.


def ensembler(predALab, destination_file):
    ensemblePair = {}

    print("Esecuzione ensemble pairs")
    ensemblePair.update({'DT RF': majorityVotePairs(predALab['Decision_Tree'],
                                                    predALab['Random_Forest'])})
    #ensemblePair.update({'DT GBT': majorityVotePairs(predALab['Decision_Tree'],
    #                                                 predALab['Gradient_Boosted_Tree'])})
    ensemblePair.update({'DT LR': majorityVotePairs(predALab['Decision_Tree'],
                                                    predALab['Logistic_Regression'])})
    ensemblePair.update({'DT LSVC': majorityVotePairs(predALab['Decision_Tree'],
                                                      predALab['Linear_SVC'])})
    #ensemblePair.update({'RF GBT': majorityVotePairs(predALab['Random_Forest'],
    #                                                 predALab['Gradient_Boosted_Tree'])})
    ensemblePair.update({'RF LR': majorityVotePairs(predALab['Random_Forest'],
                                                    predALab['Logistic_Regression'])})
    ensemblePair.update({'RF LSVC': majorityVotePairs(predALab['Random_Forest'],
                                                      predALab['Linear_SVC'])})
    #ensemblePair.update({'GBT LR': majorityVotePairs(predALab['Gradient_Boosted_Tree'],
    #                                                 predALab['Logistic_Regression'])})
    #ensemblePair.update({'GBT LSVC': majorityVotePairs(predALab['Gradient_Boosted_Tree'],
    #                                                   predALab['Linear_SVC'])})
    ensemblePair.update({'LR LSVC': majorityVotePairs(predALab['Logistic_Regression'],
                                                      predALab['Linear_SVC'])})
    print("Ensemble pairs eseguiti")

    ensembleTriple = {}

    print("Esecuzione ensemble triple")
    #ensembleTriple.update({'DT RF GBT': majorityVoteTriple(predALab['Decision_Tree'],
    #                                                       predALab['Random_Forest'],
    #                                                       predALab['Gradient_Boosted_Tree'])})
    ensembleTriple.update({'DT RF LR': majorityVoteTriple(predALab['Decision_Tree'],
                                                          predALab['Random_Forest'],
                                                          predALab['Logistic_Regression'])})
    ensembleTriple.update({'DT RF LSVC': majorityVoteTriple(predALab['Decision_Tree'],
                                                            predALab['Random_Forest'],
                                                            predALab['Linear_SVC'])})
    #ensembleTriple.update({'DT GBT LR': majorityVoteTriple(predALab['Decision_Tree'],
    #                                                       predALab['Gradient_Boosted_Tree'],
    #                                                       predALab['Logistic_Regression'])})
    #ensembleTriple.update({'DT GBT LSVC': majorityVoteTriple(predALab['Decision_Tree'],
    #                                                         predALab['Gradient_Boosted_Tree'],
    #                                                         predALab['Linear_SVC'])})
    ensembleTriple.update({'DT LR LSVC': majorityVoteTriple(predALab['Decision_Tree'],
                                                            predALab['Logistic_Regression'],
                                                            predALab['Linear_SVC'])})
    #ensembleTriple.update({'RF GBT LR': majorityVoteTriple(predALab['Random_Forest'],
    #                                                       predALab['Gradient_Boosted_Tree'],
    #                                                       predALab['Logistic_Regression'])})
    #ensembleTriple.update({'RF GBT LSVC': majorityVoteTriple(predALab['Random_Forest'],
    #                                                         predALab['Gradient_Boosted_Tree'],
    #                                                         predALab['Linear_SVC'])})
    ensembleTriple.update({'RF LR LSVC': majorityVoteTriple(predALab['Random_Forest'],
                                                            predALab['Logistic_Regression'],
                                                            predALab['Linear_SVC'])})
    #ensembleTriple.update({'GBT LR LSVC': majorityVoteTriple(predALab['Gradient_Boosted_Tree'],
    #                                                         predALab['Logistic_Regression'],
    #                                                         predALab['Linear_SVC'])})
    print("Ensemble triple eseguiti")

    ensembleQuadruple = {}

    print("Esecuzione ensemble quadruple")
    #ensembleQuadruple.update({'DT RF GBT LR': majorityVoteQuadruple(predALab['Decision_Tree'],
    #                                                                predALab['Random_Forest'],
    #                                                                predALab['Gradient_Boosted_Tree'],
    #                                                                predALab['Logistic_Regression'])})
    #ensembleQuadruple.update({'DT RF GBT LSVC': majorityVoteQuadruple(predALab['Decision_Tree'],
    #                                                                  predALab['Random_Forest'],
    #                                                                  predALab['Gradient_Boosted_Tree'],
    #                                                                  predALab['Linear_SVC'])})
    ensembleQuadruple.update({'DT RF LR LSVC': majorityVoteQuadruple(predALab['Decision_Tree'],
                                                                     predALab['Random_Forest'],
                                                                     predALab['Logistic_Regression'],
                                                                     predALab['Linear_SVC'])})
    #ensembleQuadruple.update({'DT GBT LR LSVC': majorityVoteQuadruple(predALab['Decision_Tree'],
    #                                                                  predALab['Gradient_Boosted_Tree'],
    #                                                                  predALab['Logistic_Regression'],
    #                                                                  predALab['Linear_SVC'])})
    #ensembleQuadruple.update({'RF GBT LR LSVC': majorityVoteQuadruple(predALab['Random_Forest'],
    #                                                                  predALab['Gradient_Boosted_Tree'],
    #                                                                  predALab['Logistic_Regression'],
    #                                                                  predALab['Linear_SVC'])})
    print("Ensemble quadruple eseguiti")

    ensembleQuintuple = {}

    #print("Esecuzione ensemble quintuple")
    #ensembleQuintuple.update({'DT RF GBT LR LSVC': majorityVoteQuintuple(predALab['Decision_Tree'],
    #                                                                     predALab['Random_Forest'],
    #                                                                     predALab['Gradient_Boosted_Tree'],
    #                                                                     predALab['Logistic_Regression'],
    #                                                                     predALab['Linear_SVC'])})
    #print("Ensemble quintuple eseguito")

    result = {}

    for key, value in ensemblePair.items():
        result[key] = ma.metricsEvalutation(sc.parallelize(value), len(value), False)

    for key, value in ensembleTriple.items():
        result[key] = ma.metricsEvalutation(sc.parallelize(value), len(value), False)

    for key, value in ensembleQuadruple.items():
        result[key] = ma.metricsEvalutation(sc.parallelize(value), len(value), False)

    #for key, value in ensembleQuintuple.items():
    #    result[key] = ma.metricsEvalutation(sc.parallelize(value), len(value), False)

    with open(str(Path(__file__).parent) + "/CSV_Results/" + destination_file + ".csv", "a") as ensemble_metric:
        csvWriter = csv.writer(ensemble_metric)

        # csvWriter.writerow(['EnsembleType', 'Sensitivity', 'Fallout', 'Specificity', 'Miss_Rate', 'Test_Err', 'AUC'])
        for key, metrics in result.items():
            csvWriter.writerow([key,
                                str(metrics.sensitivity), str(metrics.fallout),
                                str(metrics.specificity), str(metrics.missRate),
                                str(metrics.testErr), str(metrics.AUC)])
        csvWriter.writerow(["##################"])
        csvWriter.writerow(['DT = Decision_Tree'])
        csvWriter.writerow(['RF = Random_Forest'])
        #csvWriter.writerow(['GBT = Gradient_Boosted_Tree'])
        csvWriter.writerow(['LR = Logistic_Regression'])
        csvWriter.writerow(['LSVC = Linear_SVC'])
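
The majorityVote* helpers are not shown in these examples; a minimal sketch of the triple variant, assuming each argument is a list of (prediction, label) tuples with binary 0/1 predictions aligned on the same test rows:

def majorityVoteTriple(pred_a, pred_b, pred_c):
    voted = []
    for (p1, label), (p2, _), (p3, _) in zip(pred_a, pred_b, pred_c):
        # majority of three: at least two classifiers must predict 1
        vote = 1.0 if (p1 + p2 + p3) >= 2 else 0.0
        voted.append((vote, label))
    return voted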
Example #5
import pickle
import sys
import time
import numpy as np
from numpy.linalg import norm
from pyspark.shell import sc
import matplotlib.pyplot as plt


def parse_data(row):
    '''
    Parse one pandas row into a tuple of (station_name, feature_vec),
    where feature_vec is the concatenation of the projection vectors
    of TAVG, TRANGE, and SNWD.
    '''
    return (row[0],
            np.concatenate([row[1], row[2], row[3]]))
# Read the data
data = pickle.load(open("stations_projections.pickle", "rb"))
rdd = sc.parallelize([parse_data(row[1]) for row in data.iterrows()])
rdd.take(1)
# Number of centroids
K = 5
# Number of K-means runs that are executed in parallel. Equivalently, number of sets of initial points
RUNS = 25
# For reproducibility of results
RANDOM_SEED = 60295531
# The K-means algorithm is terminated when the change in the
# location of the centroids is smaller than 0.1
converge_dist = 0.1
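
The iteration itself is not shown here; a minimal single-initialization Lloyd loop consistent with these parameters might look as follows (the original runs RUNS initializations in parallel, which this sketch omits):

def closest(vec, centroids):
    # index of the nearest centroid under the Euclidean norm
    return min(range(len(centroids)), key=lambda i: norm(vec - centroids[i]))

centroids = [vec for _, vec in rdd.takeSample(False, K, RANDOM_SEED)]
change = float("inf")
while change > converge_dist:
    # assign each station to its closest centroid, then average per cluster
    stats = rdd.map(lambda nv: (closest(nv[1], centroids), (nv[1], 1))) \
               .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
               .collectAsMap()
    new_centroids = [stats[i][0] / stats[i][1] if i in stats else centroids[i]
                     for i in range(K)]
    change = sum(norm(new_centroids[i] - centroids[i]) for i in range(K))
    centroids = new_centroids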
Example #6
from pyspark import SparkContext, SparkConf
from pyspark.shell import sc

data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)
distData.collect()

#data = [1, 2, 3, 4, 5]
#distData = sc.parallelize(data)
#print(distData.reduce(lambda a, b: a + b))

#csv_data = sc.textFile("file:///home/hpinto/Desktop/mycloud/DimCurrency.csv")  # SparkContext has no .csv(); textFile + split is the RDD route
#csv_data  = csv_data.map(lambda p: p.split(","))
#df_csv = csv_data.map(lambda p: Row(CurrencyKey= p[0],CurrencyAlternateKey = p[1],CurrencyName = p[2])).toDF()

#df_csv.write.format("orc").saveAsTable("employeesZZZ")
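
If the first commented block above is uncommented, distData.reduce(lambda a, b: a + b) sums the elements across partitions and prints 15.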
Example #7
from pyspark.rdd import RDD
from pyspark.shell import sc


def tvars(prefix: str, fr: int, to: int) -> RDD:
    data = range(fr, to + 1)
    distData: RDD = sc.parallelize(data)
    return distData \
        .map(lambda x: (x, f"{prefix}{x}")) \
        .sortBy(lambda x: x)
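
A quick usage check:

print(tvars("t", 1, 3).collect())
# [(1, 't1'), (2, 't2'), (3, 't3')]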
Example #8
# -*- coding:utf-8 -*-
from pyspark.ml.stat import Summarizer
from pyspark.shell import sc
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

df = sc.parallelize([
    Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
    Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))
]).toDF()

# create summarizer for multiple metrics "mean" and "count"
summarizer = Summarizer.metrics("mean", "count")

# compute statistics for multiple metrics with weight
df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)

# compute statistics for multiple metrics without weight
df.select(summarizer.summary(df.features)).show(truncate=False)

# compute statistics for single metric "mean" with weight
df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)

# compute statistics for single metric "mean" without weight
df.select(Summarizer.mean(df.features)).show(truncate=False)
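
With these weights the weighted mean collapses to the first row's vector, [1.0, 1.0, 1.0], while the unweighted mean over both rows is [1.0, 1.5, 2.0]; comparing the two show() outputs makes the effect of the weight column visible.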
Example #9
import pyspark
import pyspark.sql.types as typ
from pyspark.shell import sc, sqlContext
import csv

# read CSV data from a local file
rows = []
with open("records.csv", 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        rows.append(row)

rdd: pyspark.rdd.RDD = sc.parallelize(rows)
"""
row[0] = TimeStamp
row[1] = EventType
row[2] = Private IP
row[3] = Private Port
row[6] = Destination IP
row[7] = Destination Port
"""

reduced = rdd.map(lambda row: ((row[2], row[3], row[6], row[7]), [(row[1], row[0])])) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda row: (row[0], sorted(row[1], key=lambda text: text[0]))).filter(lambda row: len(row[1]) == 2) \
    .map(lambda row: (row[1][0][1], row[1][1][1],
                      row[0][0], row[0][1], row[0][2], row[0][3]))

schema_red = typ.StructType([
    typ.StructField('Start Date', typ.StringType(), False),
    typ.StructField('End Date', typ.StringType(), False),
    # the remaining fields mirror the grouping key built above
    typ.StructField('Private IP', typ.StringType(), False),
    typ.StructField('Private Port', typ.StringType(), False),
    typ.StructField('Destination IP', typ.StringType(), False),
    typ.StructField('Destination Port', typ.StringType(), False)
])

df_red = sqlContext.createDataFrame(reduced, schema_red)
df_red.show()
Example #10
from pyspark import Row
from pyspark.ml.fpm import PrefixSpan
from pyspark.shell import sc

df = sc.parallelize([Row(sequence=[[1, 2], [3]]),
                     Row(sequence=[[1], [3, 2], [1, 2]]),
                     Row(sequence=[[1, 2], [5]]),
                     Row(sequence=[[6]])]).toDF()

prefixSpan = PrefixSpan(minSupport=0.5, maxPatternLength=5,
                        maxLocalProjDBSize=32000000)

# Find frequent sequential patterns.
prefixSpan.findFrequentSequentialPatterns(df).show()
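
With minSupport=0.5, a pattern must appear in at least two of the four sequences; the result therefore includes, for example, [[1, 2]] with frequency 3 and [[1], [3]] with frequency 2.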
Example #11
import pyspark
import pyspark.sql.types as typ
from pyspark.shell import sc, sqlContext

rdd: pyspark.rdd.RDD = sc.parallelize([('xxx', 'yyy', 'zzz', 'C', 22),
                                       ('xxx', 'yyy', 'zzz', 'D', 23),
                                       ('xxx1', 'yyy1', 'zzz1', 'C', 24),
                                       ('xxx1', 'yyy1', 'zzz1', 'D', 25)])

reduced = rdd.map(lambda row: ((row[0], row[1], row[2]), [(row[3], row[4])])) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda row: (row[0], sorted(row[1], key=lambda text: text[0]))) \
    .map(
    lambda row: (row[0][0], row[0][1], row[0][2], ','.join([str(e[0]) for e in row[1]]), row[1][0][1], row[1][1][1])) \
    .filter(lambda row: row[3] == "C,D")

schema_red = typ.StructType([
    typ.StructField('Data1', typ.StringType(), False),
    typ.StructField('Data2', typ.StringType(), False),
    typ.StructField('Data3', typ.StringType(), False),
    typ.StructField('Type', typ.StringType(), False),
    typ.StructField('Start Date', typ.StringType(), False),
    typ.StructField('End Date', typ.StringType(), False)
])

df_red = sqlContext.createDataFrame(reduced, schema_red)
df_red.show()

# Explanation ###################################################################
# The same pipeline as above, spelled out as a single chain:
result = rdd.map(lambda row: ((row[0], row[1], row[2]), [(row[3], row[4])])) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda row: (row[0], sorted(row[1], key=lambda text: text[0]))) \
    .map(lambda row: (row[0][0], row[0][1], row[0][2],
                      ','.join([str(e[0]) for e in row[1]]),
                      row[1][0][1], row[1][1][1])) \
    .filter(lambda row: row[3] == "C,D")
Example #12
from pyspark import SparkConf, SparkContext
from pyspark.shell import sc

accum = sc.accumulator(0)
print(accum)

sc.parallelize([1, 2, 3, 4]).foreach(lambda x: accum.add(x))

print(accum.value)

# Note: reading accum.value inside a task raises an exception.
# Accumulators are write-only on the executors; only the driver can read them.
# Each executor contributes via add(); the partial sums are merged on the driver.
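
To make the first point concrete, this (deliberately broken) variant would fail on the workers, since tasks may only call add():

# sc.parallelize([1, 2]).foreach(lambda x: print(accum.value))  # raises an exception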
Example #13
import random

from pyspark.shell import sc


def inside(p):
    # p is the sample index (unused); draw a random point in the unit square
    # and test whether it falls inside the quarter circle of radius 1
    x, y = random.random(), random.random()
    return x * x + y * y < 1


count = sc.parallelize(range(0, 100000000)) \
    .filter(inside).count()
print("Pi is roughly %f" % (4.0 * count / 100000000))