def initialize():
    global sc, spark, items, inputfile, buckets_user, buckets_business, partition, totalSize, t, mainThreshold
    print("Initializing...")
    t = time.time()
    candidateList = []
    frequentList = []
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)

    buckets_user = items.groupByKey().mapValues(list).filter(
        lambda x: len(x[1]) > mainThreshold).mapPartitionsWithIndex(
            removeDuplicateEntriesAfter)
    print("Without Duplicates DOne..")
    # withoutDuplicates = checkM.mapPartitionsWithIndex(
    #     removeDuplicateEntries).groupByKey().mapValues(list)

    if (case == 1):
        # buckets_user = withoutDuplicates.mapPartitionsWithIndex(
        #     createBuckets).groupByKey().mapValues(list).filter(lambda x: len(x[1]) > mainThreshold)

        callSonPhase1(buckets_user)
        print("Initializing Phase 2.....")
        finalFreq = buckets_user.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))

        # print((finalFreq.collect()))
        finalOutput = (finalFreq.collect())
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)

        pass
    if (case == 2):
        # NOTE: `withoutDuplicates` is only defined in the commented-out block
        # near the top of this function; that block must be restored before
        # this case-2 branch can run.
        buckets_business = withoutDuplicates.mapPartitionsWithIndex(
            createBuckets_case2).groupByKey().mapValues(list)
        callSonPhase1(buckets_business)
        print("Initializing Phase 2.....")
        finalFreq = buckets_business.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))

        # print((finalFreq.collect()))
        finalOutput = (finalFreq.collect())
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)

        pass
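
# A minimal sketch of what the `phase2` step used above could look like (SON
# phase 2): each partition counts, in its own baskets, every candidate itemset
# produced by phase 1 and emits (candidate, local_count) pairs; the
# reduceByKey/filter chain above then keeps candidates whose global support
# reaches `threshold`. Assumption: the phase-1 candidates are available as a
# global `candidateList` of tuples -- the real helper in this example may differ.
def phase2_sketch(partition_index, iter_row):
    counts = {}
    for _, basket in iter_row:               # each row is (key, list_of_items)
        basket = set(basket)
        for candidate in candidateList:      # assumed global from SON phase 1
            if set(candidate).issubset(basket):
                counts[candidate] = counts.get(candidate, 0) + 1
    return counts.items()                    # iterable of (candidate, count)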
Example #2
def initialize():
    global items, inputfile, sc, filterThreshold, t, totalEdges, cost_dict, strict_totalNodes, adjacency_listMain
    t = time.time()
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    # sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    # print(columnName)
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)

    # Getting user and their business count
    user_business = items.groupByKey().mapValues(set).collect()
    tuple_edge_list = []

    for i in range(0, len(user_business) - 1):
        for j in range(i + 1, len(user_business)):
            inter = user_business[i][1] & user_business[j][1]
            if len(inter) >= filterThreshold:
                tuple_edge_list.append(
                    (str(user_business[i][0]), str(user_business[j][0])))
                tuple_edge_list.append(
                    (str(user_business[j][0]), str(user_business[i][0])))

    totalEdges = float(len(tuple_edge_list) / 2)
    adjacency_list = sc.parallelize(tuple_edge_list).groupByKey().mapValues(
        list).collectAsMap()
    adjacency_listMain = copy.deepcopy(adjacency_list)
    totalNodes = list(adjacency_list.keys())

    # ------------------------Newly added line------------------------
    strict_totalNodes = copy.deepcopy(totalNodes)
    # print(len(totalNodes))

    # ----------------------Part 1---------------------
    bfs(totalNodes, adjacency_list)
    print("Writing Betweenness to File....")

    # Converting into sorted List Initial Betweenness
    list_val = list(cost_dict.items())

    list_val.sort(key=lambda x: (-x[1], x[0]))
    writeToFile(list_val)
    totalNodes = copy.deepcopy(strict_totalNodes)
    # print(len(totalNodes))
    # ----------------------Part 2----------------------
    print("Creating Partitions....")
    create_components(list_val, adjacency_listMain, totalNodes, totalEdges)
    # ---------------------EoC---------------------------

    print("Duration: " + str(time.time() - t))
Example #3
def initialize():
    global sc, spark, items, inputfile
    print("Initializing...")
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    jsonread = sc.textFile(inputfile)
    items = jsonread.map(json.loads)
Example #4
import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext #, HiveContext
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()

#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf = conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testTogether")


# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#    SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments

def filterDateRage(_from, _to, col):
    # NOTE: only the timezone setup survives in this snippet; the date-range
    # filter body itself is missing.
    loc = timezone('Europe/Berlin')
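
# A minimal sketch of the workflow described in the comments above, done on the
# Spark side instead of row-by-row CQL: filter the already-loaded `rdd` to the
# date range / site / tag, recompute `cnts` as the sum of the listed counters,
# and write the rows back. This assumes dict-like rows, string-typed dates, and
# that `saveToCassandra` is available on the RDD (pyspark_cassandra); adjust to
# the row_format actually in use.
def recompute_cnts(row):
    row['cnts'] = (row['ga_videoPlays'] + row['sda_downloads'] +
                   row['fb_socialFacebookLikes'] + row['fb_socialFacebookShares'] +
                   row['fb_socialFacebookComments'] + row['tw_socialTwitterShares'] +
                   row['ga_socialGooglePlusShares'] + row['gigya_socialComments'])
    return row

updated = (rdd.filter(lambda r: '2015-10-01' <= r['date'] <= '2015-10-10')
              .filter(lambda r: r['site'] == 'giga' and 'resort:android' in r['tags'])
              .map(recompute_cnts))
# updated.saveToCassandra("el_test", "cockpit2_testTogether")  # write-back (assumed API)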
Example #5
# /opt/spark/bin/pyspark --master local[1] --jars /opt/symetry/lib/sym-spark-assembly.jar --driver-java-options -Dsym.lic.loc=/opt/symetry/sym.lic
# execfile('/Users/mike/rtds/master/RTLM/ScalaProjects/sym-shell/src/com/sml/examples/python/amazonExample.py')

import os
import sys
import pyspark
from pyspark.context import SparkContext
from pyspark.context import SparkConf
from pyspark.sql import SQLContext, HiveContext
from pyspark.storagelevel import StorageLevel

print("amazonExample.py start")

conf = SparkConf()
conf.setAppName('amazonExample')
sc = SparkContext(conf=conf)

gateway         = sc._gateway
sym             = gateway.jvm.com.sml.shell

# Find the access keys for EC2.
awsAccessKeyId = os.environ['AWS_ACCESS_KEY']
awsSecretAccessKey = os.environ['AWS_SECRET_KEY']
# print("awsAccessKeyId=" + awsAccessKeyId)
# print("awsSecretAccessKey=" + awsSecretAccessKey)

sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", awsAccessKeyId)
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", awsSecretAccessKey)

myrdd  = sc.textFile('s3a://sml-oregon/datasets/susy/SUSYmini.csv')
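
# The SUSY file is plain numeric CSV; a small, assumed follow-up that parses each
# line into floats and peeks at the data (the exact column layout is not shown here):
parsed = myrdd.map(lambda line: [float(v) for v in line.split(',')])
print("rows: %d, first: %s" % (parsed.count(), parsed.first()))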
Example #6
# This configuration works for Spark on macOS using Homebrew
import os, sys
# set OS environment variable
os.environ["SPARK_HOME"] = '/usr/local/Cellar/apache-spark/2.2.0/libexec'
# add Spark library to Python
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], 'python'))

# import package
import pyspark
from pyspark.context import SparkContext, SparkConf

import atexit


def stop_my_spark():
    global sc
    sc.stop()
    del sc


# Register exit
atexit.register(stop_my_spark)

# Configure and start Spark ... but only once.
if 'sc' not in globals():
    conf = SparkConf()
    conf.setAppName('MyFirstSpark')  ## you may want to change this
    conf.setMaster('local[2]')
    sc = SparkContext(conf=conf)
    print "Launched Spark version %s with ID %s" % (sc.version,
                                                    sc.applicationId)
Example #7
import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext #, HiveContext
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()

#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf = conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testIndexes")


# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#    SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments

def filterDateRage(_from, _to, col):
    # NOTE: as above, only the timezone setup survives; the filter body is missing.
    loc = timezone('Europe/Berlin')
Example #8
def initialize():
    global sc, spark, items, inputfile, t, m, gidDict, bids, hashedList, n, b, r, candidateTuple, listvala, listvalb
    print("Initializing...")

    t = time.time()
    candidateList = []
    frequentList = []
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    # sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)
    #  column name is userid, businessid, starts

    userids = sorted(set(items.keys().collect()))
    k = 0
    for user in userids:
        if (user not in gidDict):
            gidDict[user] = k
            k = k + 1
    # print(k)

    bids = sorted(set(items.values().collect()))
    # bids = copy.copy(sorted(bids))
    # print(len(bids))
    m = len(userids)
    listvala = random.sample(range(1, m), n)
    listvalb = random.sample(range(1, m), n)
    bid_uid = items.map(lambda x: ((x[1], x[0]), 1)).reduceByKey(
        lambda x, y: x + y).map(lambda x: (x[0])).groupByKey().mapValues(list)

    bid_uid_hashed = bid_uid.map(lambda x: initialHash(x))

    dict_uniques = {}
    for each in bid_uid.collect():
        dict_uniques[each[0]] = set(each[1])

    bid_uid_hashed2 = bid_uid_hashed.map(lambda x: hashing(x))
    # print(bid_uid_hashed2.first())

    # creating signature matrix column per business IDs
    start = 0
    end = r
    tempSim = []
    finalList = []
    hashedListSet = bid_uid_hashed2.collect()
    length = len(hashedListSet)
    c = 1
    print("Finding similar pairs...")

    dictionEvery = {}

    while (end <= n):
        tempDict = []
        for each in hashedListSet:
            templist = sorted(each[1][start:end])
            tempDict.append((tuple(templist), each[0]))
            # tempDict.append((tuple(each[1][start:end]), each[0]))
        dictionEvery[c] = tempDict
        c = c + 1
        start = end
        end = end + r
    dictionaryCheck = {}

    # for i in range(1, b+1):
    #     dictionaryCheck = {}
    #     for i in range(0, )

    length = len(dictionEvery[1])
    candidateset = []
    candidateTuple = []
    print("Working on Bands 1 to 40 ")
    for i in range(1, b + 1):
        justBid = []
        dictionBand = dictionEvery[i]
        # print("Working on Band: "+str(i))
        mapper = sc.parallelize(dictionBand).groupByKey().mapValues(
            list).filter(lambda x: (len(x[1]) > 1))

        justBid = mapper.map(lambda c: c[1]).collect()
        candidateTuple.append(justBid)
        # print(justBid)

    # print(len(candidateTuple[0]))
    # print(len(candidateTuple[1]))
    # print((candidateTuple[1]))

    candidateset = (candidateTuple)  # it was list(set(candidateTuple))

    candidatepairs = []
    count = 0
    for each in candidateset:
        for e in each:
            l1 = list(combinations(sorted(e), 2))
            candidatepairs.extend(l1)

    candPairSet = []  # set()

    candPairSet = (candidatepairs)  # it was list(set(candidatepairs))
    lines = []
    print("Finding final Jaccard Simmilarity")
    finalPairs = []
    for each in candPairSet:
        set1 = dict_uniques[each[0]]
        set2 = dict_uniques[each[1]]
        inter = set1 & set2
        # print(len(inter), len(set1), len(set2))
        jaccard = (float(len(inter))) / (float(len(set1.union(set2))))
        # print(jaccard)
        if (jaccard >= 0.5):
            # print(jaccard)
            lines.append([each[0], each[1], jaccard])
            finalPairs.append(each)

    # print(len(list(set(finalPairs))))
    # print(len((finalPairs)))
    answer = writeToFile(lines)
    # calculatingPreRec(lines)
    print("Total Items Printed: " + str(answer))
    print("Duration: " + str(time.time() - t))
Example #9
def initialize():
    global sc, spark, inputfile, t, items, validationfile, dictUid, dictBid, list_unaccounted, dict_code_uid, dict_code_bid, case
    t = time.time()
    sc_conf = SparkConf()
    sc_conf.setAppName("Task2")
    sc_conf.setMaster('local[*]')
    # sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)

    # ------------Reading evaluation data-----------
    csvread2 = sc.textFile(validationfile)
    columnName2 = csvread2.first().split(',')
    validationData = csvread2.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName2)

    # calling case 3:
    if case == 3:
        implement_case3(items, validationData)
        print("Duration: " + str(time.time() - t))
        return
    # calling case 2:
    if case == 2:
        implement_case2(items, validationData)
        print("Duration: " + str(time.time() - t))
        return
    # Ending case 2

    # ------------PreProcessing data for training the model-----------
    if case == 1:
        bid_uid = items.map(lambda u: (u[0], u[1]))

        keys = list(set(bid_uid.keys().collect()))
        values = list(set(bid_uid.values().collect()))

        dictUid = dict(zip(keys, range(0, len(keys))))
        dictBid = dict(zip(values, range(0, len(values))))
        for k, v in dictUid.items():
            dict_code_uid[v] = k

        for k, v in dictBid.items():
            dict_code_bid[v] = k

        ratings = items.map(lambda l: Rating(int(dictUid[l[0]]),
                                             int(dictBid[l[1]]), float(l[2])))

        # Training the model on train data
        rank = 2
        lambd = 0.5
        numIterations = 10
        model = ALS.train(ratings, rank, numIterations, lambd)

        print("Total entries in validation data: " +
              str(len(validationData.collect())))
        # ----------------------Creating a map with integer values for users and business on validation test set-----------------

        test_on_validation = validationData.map(lambda p: mapData(p))
        #
        validationRating = test_on_validation.filter(
            lambda p: (p[0] == 1)).map(lambda r: (r[1][0], r[1][1], r[1][2]))

        accountedPairs = test_on_validation.filter(lambda p: (p[0] == 1)).map(
            lambda r: (r[1][0], r[1][1]))

        UnaccountedPairs = test_on_validation.filter(lambda p: p[0] == 0).map(
            lambda r: ((r[1][0], r[1][1]), 2.75))

        # print("Accounted Pairs: "+str(len(accountedPairs.collect())))

        # print("Unaccounted Pairs: "+str(len(UnaccountedPairs.collect())))
        # print(test_on_validation.count())
        # print("Unaccounted Pairs: "+str(len(list_unaccounted)))

        # ----------------------Evaluate the model on training data----------------------
        # testdata = ratings.map(lambda p: (p[0], p[1]))
        # predictions = model.predictAll(testdata).map(
        #     lambda r: ((r[0], r[1]), r[2]))
        # ratesAndPreds = ratings.map(lambda r: (
        #     (r[0], r[1]), r[2])).join(predictions)
        # MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()

        # # import validation data

        # print("Mean Squared Error = " + str(MSE))

        # ----------------------Evaluate the model on testing data----------------------
        predictions = model.predictAll(accountedPairs).map(
            lambda r: ((r[0], r[1]), r[2]))
        # print(len(predictions.collect()))
        finalpred = predictions.union(UnaccountedPairs)
        # print(len(finalpred.collect()))
        # return
        # ratesAndPreds = validationRating.map(lambda r: (
        #     (r[0], r[1]), r[2])).join(predictions)
        ratesAndPreds = validationRating.map(
            lambda r: ((r[0], r[1]), r[2])).join(finalpred)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        writeToFile(finalpred)
        rmse = math.sqrt(MSE)
        print("Root Mean Squared Error = " + str(rmse))
    print("Duration: " + str(time.time() - t))
Example #10
#
# This configuration works for Spark on a Hortonworks HDP 2.4 cluster (YARN client mode)
#
import os, sys
# set OS environment variable
os.environ["SPARK_HOME"] = '/usr/hdp/2.4.2.0-258/spark'
# add Spark library to Python
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], 'python'))

# import package
import pyspark
from pyspark.context import SparkContext, SparkConf

import atexit
def stop_my_spark():
    global sc
    sc.stop()
    del sc

# Register exit    
atexit.register(stop_my_spark)

# Configure and start Spark ... but only once.
if 'sc' not in globals():
    conf = SparkConf()
    conf.setAppName('MyFirstSpark') ## you may want to change this
    conf.setMaster('yarn-client')
    sc = SparkContext(conf=conf)
    print "Launched Spark version %s with ID %s" % (sc.version, sc.applicationId)
    print "http://arc.insight.gsu.edu:8088/cluster/app/%s"% (sc.applicationId)