Example #1
import sys
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql.context import SQLContext
from pyspark.rdd import RDD
import pyspark

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: kafka_wordcount.py <zk> <topic>", file=sys.stderr)
        exit(-1)
    conf = pyspark.SparkConf().set('spark.driver.host', '127.0.0.1')
    sc = pyspark.SparkContext(master='local[*]', appName='PythonStreamingKafkaWordCount', conf=conf)
    ssc = StreamingContext(sc, 1)

    # sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    # ssc = StreamingContext(sc, 1)

    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    lines.pprint()

    # counts = lines.flatMap(lambda line: line.split(" ")) \
    #               .map(lambda word: (word, 1)) \
    #               .reduceByKey(lambda a, b: a+b)
    # counts.pprint()

    ssc.start()
    ssc.awaitTermination()
Example #2
import binascii
import json
import sys
import time

import pyspark

# hyper-parameters for the hash functions
LENGTH_BIT_ARRAY = 10000
# NUM_HASH = 8 # optimal k = (n / m) * ln2
HASH_PARA_A = [1, 2, 3, 5, 7, 11, 13, 17]
HASH_PARA_B = [23, 7717, 5837, 8147, 874, 457, 3529, 15]

if __name__ == "__main__":

    start_time = time.time()

    # parse commandline argument
    first_json_path = sys.argv[1]
    second_json_path = sys.argv[2]
    output_file_path = sys.argv[3]

    conf = pyspark.SparkConf().setAppName("Task1").setMaster("local[*]")
    sc = pyspark.SparkContext(conf=conf)
    sc.setLogLevel("ERROR")

    cityRDD = sc.textFile(first_json_path).map(lambda x: json.loads(x)).map(
        lambda x: x["city"])
    city_set = set(cityRDD.distinct().collect())
    city_set.discard('')

    bit_array = [0 for _ in range(LENGTH_BIT_ARRAY)]

    for city in city_set:
        hashcodes = [(a * int(binascii.hexlify(city.encode("utf8")), 16) + b) %
                     LENGTH_BIT_ARRAY
                     for a, b in zip(HASH_PARA_A, HASH_PARA_B)]
        for hashcode in hashcodes:
            bit_array[hashcode] = 1
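
The loop above is the "insert" half of a Bloom filter, and the commented-out formula gives the usual rule of thumb for the number of hash functions (bits per element times ln 2). A minimal sketch of the matching membership test under the same hash scheme (might_contain is illustrative, not part of the original script):

def might_contain(name, bit_array):
    # Probe the same hash positions that were set when the filter was built.
    value = int(binascii.hexlify(name.encode("utf8")), 16)
    hashcodes = [(a * value + b) % LENGTH_BIT_ARRAY
                 for a, b in zip(HASH_PARA_A, HASH_PARA_B)]
    # "Maybe present" only if every probed bit is 1; a single 0 bit means definitely absent.
    return all(bit_array[h] for h in hashcodes)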
Example #3
def spark():
    conf = pyspark.SparkConf()
    return get_spark_session(conf)
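
get_spark_session is defined outside this snippet; a minimal sketch of what such a helper might look like, assuming it simply wraps SparkSession.builder with the given conf:

from pyspark.sql import SparkSession

def get_spark_session(conf):
    # Hypothetical helper: build (or reuse) a SparkSession from a SparkConf.
    return SparkSession.builder.config(conf=conf).getOrCreate()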
Example #4
import pyspark as ps
from pyspark.sql import SQLContext

conf = ps.SparkConf().setMaster("local[4]").setAppName("p5_spark")
sc = ps.SparkContext(conf=conf)


textPath = 'Meteorite_Landings.csv'
sqlContext = SQLContext(sc)
df = sqlContext.read.csv(textPath)

# Extract the nameType and Mass columns
dataDF = df.select("_c2", "_c4")

NoneType = type(None)
dataRDD = dataDF.rdd.filter(lambda x: type(x._c2) != NoneType and type(x._c4) != NoneType)
dataRDD = dataRDD.map(lambda x: (x._c2.split("-")[0], float(x._c4)))

dataPerType = dataRDD.groupByKey().map(lambda x: (x[0], list(x[1])))
sumDataPerType = dataPerType.map(lambda x: (x[0], sum(x[1])))
elementsPerData = dataPerType.map(lambda x: (x[0], len(x[1])))

elementsJoined = sumDataPerType.join(elementsPerData)
averagePerType = elementsJoined.map(lambda x: (x[0], x[1][0]/x[1][1]))

print(averagePerType.sortByKey().collect())
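
The groupByKey-plus-join approach above materializes every per-type list; the same averages can be computed in a single shuffle with aggregateByKey. A sketch of that alternative (not the original author's code):

# (sum, count) per key in one pass, then divide.
sumCount = dataRDD.aggregateByKey((0.0, 0),
                                  lambda acc, v: (acc[0] + v, acc[1] + 1),
                                  lambda a, b: (a[0] + b[0], a[1] + b[1]))
averagePerType2 = sumCount.mapValues(lambda sc: sc[0] / sc[1])
print(averagePerType2.sortByKey().collect())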
Example #5
import findspark
spark_home="C:\Users\our\Downloads\spark-1.6.0-bin-hadoop2.6"
findspark.init(spark_home)

import pyspark
conf = pyspark.SparkConf().setAppName("myApp")
sc = pyspark.SparkContext(conf=conf)

import oauth2
import os
import urllib
import json
import codecs

def getKey(keyPath):
    d=dict()
    f=open(keyPath,'r')
    lines=f.readlines()
    for line in lines:
        row=line.split('=')
        row0=row[0]
        d[row0] =row[1].strip()
    return d

keyPath=os.path.join(os.getcwd(),'src','key.properties')
key=getKey(keyPath)
consumer = oauth2.Consumer(key=key['api_key'], secret=key['api_secret'])
token=oauth2.Token(key=key['access_token'], secret=key['access_secret'])
client = oauth2.Client(consumer,token)
Example #6
import pyspark
import pyspark.sql

if __name__ == '__main__':
    config = pyspark.SparkConf().setAppName("Basico")
    sc = pyspark.SparkContext(conf=config)

    sqlCtx = pyspark.sql.SQLContext(sc)

    dfVuelos = sqlCtx.read.csv('hdfs://localhost:9000/datos', header=True)

    dfVuelos.printSchema()
    dfVuelos.createOrReplaceTempView("VUELOS")

    sqlCtx.sql("""
        select flightNum, SUM(cast(distance as int))
        from VUELOS
        group by flightNum
        """).show()
Example #7
                                                    LanguageCode='en')
            if len(respond['ErrorList']) != 0:
                error_index = set()
                for the_error in respond['ErrorList']:
                    error_index.add(the_error['Index'])
                for i in range(25):
                    if i not in error_index:
                        filtered.append((json_data["created_utc"], json_data["subreddit"], \
                            (respond['ResultList'][i]['Sentiment'], respond['ResultList'][i]['SentimentScore'])))
            else:
                for i in range(25):
                    filtered.append((json_data["created_utc"], json_data["subreddit"], \
                        (respond['ResultList'][i]['Sentiment'], respond['ResultList'][i]['SentimentScore'])))

            batch = []

    return filtered


if __name__ == "__main__":
    num_cores = 8
    num_partitions = num_cores * 100
    conf = pyspark.SparkConf().setAppName("RedditDataLoder")
    sc = pyspark.SparkContext(conf=conf)

    data = sc.textFile(
        data_path, minPartitions=num_partitions).mapPartitions(filter_patition)
    lengths = data.collect()
    print(len(lengths), ' ', lengths[:100])
    # data = sc.textFile(data_path, minPartitions=num_partitions)
    # print(data.count())
Example #8
        'vehicle.py', 'vehicle_stats.py', 'vehicle_utils.py',
        'get_dist_on_battery.py', 'get_charged_stats.py', 'get_total_hours.py',
        'get_average_speed.py', 'get_distance_driven.py',
        'treat_missing_data.py', 'get_energy_n_capacity.py',
        'get_geo_stats.py', 'COL_DEPENDENCY_DICT.py'
]:
    zf.write('stats-core/' + f, f)
zf.write('configs.py')
zf.close()

# configs
conf = pyspark.SparkConf().setAll([('spark.app.name',
                                    'guobiao_daily_stats_run'),
                                   ('spark.master', 'yarn'),
                                   ('spark.submit.deployMode', 'client'),
                                   ('spark.executor.memory', '5g'),
                                   ('spark.memory.fraction', '0.7'),
                                   ('spark.executor.cores', '3'),
                                   ('spark.executor.instances', '10'),
                                   ('spark.yarn.am.memory', '10g')])
conf1 = pyspark.SparkConf().setAll([('spark.app.name',
                                     'guobiao_export_to_hive'),
                                    ('spark.master', 'local'),
                                    ('spark.executor.memory', '5g'),
                                    ('spark.memory.fraction', '0.7'),
                                    ('spark.executor.cores', '3')])

COL_NUM_DICT = {
    1: 'vintype',
    2: 'ts',
    4: 'veh_charge_st',
Example #9
import os

from pyspark.ml.feature import FeatureHasher
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml import Pipeline
from pyspark.sql import functions as F

#######################################################################################
# CONFIGURATIONS
# Get current cluster setup from work directory
STREAMING_WINDOW = 60

# Initialize PySpark
SPARK_MASTER = "local[1]"
#SPARK_MASTER="spark://mpp3r03c04s06.cos.lrz.de:7077"
APP_NAME = "PySpark Lecture"
os.environ[
    "PYSPARK_PYTHON"] = "/naslx/projects/pn69si/mnmda001/software/anaconda3/bin/python"

# If there is no SparkSession, create the environment
try:
    sc and spark
except NameError as e:
    import pyspark
    import pyspark.sql
    conf = pyspark.SparkConf().set("spark.cores.max", "4")
    sc = pyspark.SparkContext(master=SPARK_MASTER, conf=conf)
    spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate()

print("PySpark initiated...")
Example #10
#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Initialize SparkContext and SparkSession
import findspark, os
findspark.init('/home/hadoop/spark')
import pyspark
from pyspark.sql import SparkSession
conf = pyspark.SparkConf().setMaster("yarn").setAppName("Jupyter PySpark Test")
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)

# In[2]:

# Read title.basics.tsv into Spark dataframe
imdb_title_basics_dataframe = spark.read.format('csv').options(
    header='true', delimiter='\t', nullValue='null',
    inferschema='true').load('/user/hadoop/imdb/title_basics/title.basics.tsv')

# In[3]:

imdb_title_basics_dataframe.printSchema()  # Print Schema of title_basics dataframe

# In[4]:

imdb_title_basics_dataframe.show(5)  # Show first 5 rows of title_basics dataframe
Example #11
File: main.py, Project: webis-de/ECIR-19
import pyspark
import os
import sys
import argparse
import importlib

if os.path.exists('text_reuse_pipline.zip'):
    sys.path.insert(0, 'text_reuse_pipline.zip')
else:
    sys.path.insert(0, './text_reuse_pipline.zip')

parser = argparse.ArgumentParser()
parser.add_argument('--job', type=str, required=True)
parser.add_argument('--job_args', nargs='*')
args = parser.parse_args()

print(args)

conf = (pyspark.SparkConf().setAppName(args.job).set("spark.network.timeout", 300)\
    .set("spark.executor.extraJavaOptions", "-XX:+UseCompressedOops -XX:+UseG1GC -XX:+UseStringDeduplication -Dio.netty.leakDetection.level=advanced")\
    .set('spark.dynamicAllocation.enabled', False)
        )
sc = pyspark.SparkContext(conf=conf)
sqlC = pyspark.SQLContext(sc)

job_module = importlib.import_module('text_reuse_pipline.jobs.%s' % args.job)
job_module.run(sc, args.job_args)
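
Each job module under text_reuse_pipline.jobs is expected to expose a run(sc, job_args) entry point; a minimal hypothetical module illustrating that contract (the module name and logic are made up):

# text_reuse_pipline/jobs/wordcount.py (hypothetical job module)
def run(sc, job_args):
    path = job_args[0] if job_args else 'input.txt'
    counts = (sc.textFile(path)
                .flatMap(lambda line: line.split())
                .map(lambda w: (w, 1))
                .reduceByKey(lambda a, b: a + b))
    print(counts.take(10))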
Example #12
import time

import pyspark

conf = pyspark.SparkConf().setAppName("Dijkstra").set("spark.dynamicAllocation.enabled", "false")
sc = pyspark.SparkContext(conf=conf)
log4jLogger = sc._jvm.org.apache.log4j
logger = log4jLogger.LogManager.getLogger(__name__)


# helper functions
#
#
def read_generated_graph_line(line):
    line = line.strip().split("\t")
    if len(line) == 2:
        return
    elif len(line) == 3:
        origin = line[0]
        neighbours = line[2]
        try:
            return [(origin, (pair.split(":")[0].strip(), int(pair.split(":")[1].strip())))
                    for pair in neighbours.split(",")]
        except IndexError:
            raise RuntimeError("file not well formatted")
    else:
        return


def shortest_path_to_point(x, y):
    """ this function is a reduce function that computes the shortest path to a certain point (the key)"""
Example #13
import datetime

import matplotlib.pyplot as plt
import pandas as pd
import pickle5 as pickle
import os
from scipy import spatial

import pyspark
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType

# Project configuration: set the app name, give the driver 6g of memory and set the master location
conf = (pyspark.SparkConf().setAppName('test').set("spark.driver.memory", "6g").setMaster("local[*]"))
# Pass the configuration to the Spark context
sc = pyspark.SparkContext(master='local', appName='myAppName', conf=conf)
sc.setLogLevel("ERROR")
# Create a Spark session
spark = SparkSession.builder.appName('read').getOrCreate()
# Add the GraphFrames jar to the Spark context
sc.addPyFile("jars/graphframes-0.8.0-spark3.0-s_2.12.jar")
from graphframes import *

"""
create the df of papers by folder authors with:
-id
-year 
-list of authors
"""
Example #14
def main(arguments):
    """Begin running the the modeller."""
    loggers = logger.get_logger()
    # set up the spark configuration
    loggers.debug("Connecting to Spark")
    conf = (pyspark.SparkConf().setAppName("JiminyModeler").set(
        'spark.executor.memory',
        '1G').set('spark.driver.memory',
                  '1G').set('spark.driver.maxResultSize', '1G'))
    # get the spark context
    spark = pyspark.sql.SparkSession.builder.config(conf=conf).getOrCreate()
    sc = spark.sparkContext

    # set up SQL connection
    try:
        con = build_connection(arguments)
    except IOError:
        loggers.error("Could not connect to data store")
        sys.exit(1)

    # fetch the data from the db
    cursor = con.cursor()
    cursor.execute("SELECT * FROM ratings")
    ratings = cursor.fetchall()
    loggers.info("Fetched data from table")
    # create an RDD of the ratings data
    ratingsRDD = sc.parallelize(ratings)
    # getting the largest timestamp. We use this to determine new entries later
    max_timestamp = ratingsRDD.map(lambda x: x[4]).max()
    # remove the final column which contains the time stamps
    ratingsRDD = ratingsRDD.map(lambda x: (x[1], x[2], x[3]))
    # split the RDD into 3 sections: training, validation and testing
    estimator = modeller.Estimator(ratingsRDD)

    if get_arg('DISABLE_FAST_TRAIN', args.slowtrain) is True:
        loggers.warn("Any ALS parameters given on the command line will not"
                     " be used in when fast train is disabled.")
        # basic parameter selection
        loggers.info('Using slow training method')
        parameters = estimator.run(ranks=[2, 4, 6, 8],
                                   lambdas=[0.01, 0.05, 0.09, 0.13],
                                   iterations=[2])
    else:
        # override basic parameters for faster testing
        loggers.info('Using fast training method')
        parameters = {
            'rank': arguments.rankval,
            'lambda': arguments.lambdaval,
            'iteration': arguments.itsval
        }

    # train the model
    model = modeller.Trainer(data=ratingsRDD,
                             rank=parameters['rank'],
                             iterations=parameters['iteration'],
                             lambda_=parameters['lambda'],
                             seed=42).train()
    loggers.info('Model has been trained')
    # write the model to model store
    model_version = 1
    writer = storage.MongoDBModelWriter(sc=sc, uri=arguments.mongoURI)
    writer.write(model=model, version=1)
    loggers.info('Model version 1 written to model store')

    while True:
        # this loop should be at the heart of this application, it will
        # continually loop until killed by the orchestration engine.
        # in this loop it should generally do the following:
        # 1. check to see if it should create a new model
        # 2. if yes, create a new model. if no, continue looping
        #    (perhaps with a delay)
        # 3. store new model

        # check to see if new model should be created
        # select the maximum time stamp from the ratings database
        cursor.execute(
            "SELECT timestamp FROM ratings ORDER BY timestamp DESC LIMIT 1;")
        checking_max_timestamp = cursor.fetchone()[0]
        loggers.info(
            "The latest timestamp = {}".format(checking_max_timestamp))

        if checking_max_timestamp > max_timestamp:
            # build a new model
            # first, fetch all new ratings
            cursor.execute("SELECT * FROM ratings WHERE (timestamp > %s);",
                           (max_timestamp, ))
            new_ratings = cursor.fetchall()
            max_timestamp = checking_max_timestamp
            new_ratingsRDD = sc.parallelize(new_ratings)
            new_ratingsRDD = new_ratingsRDD.map(lambda x: (x[1], x[2], x[3]))
            ratingsRDD = ratingsRDD.union(new_ratingsRDD)
            model_version += 1
            loggers.info("Training model, version={}".format(model_version))
            # train the model
            model = modeller.Trainer(data=ratingsRDD,
                                     rank=parameters['rank'],
                                     iterations=parameters['iteration'],
                                     lambda_=parameters['lambda'],
                                     seed=42).train()
            loggers.info("Model has been trained.")
            writer.write(model=model, version=model_version)
            loggers.info("Model version %f written to model store." %
                         (model_version))
        else:
            # sleep for 2 minutes
            loggers.info("sleeping for 120 seconds")
            time.sleep(120)
Example #15
File: BFR3.py, Project: yashinil/Clustering
from pyspark import SparkContext, SparkConf
import pyspark
import time
import sys
from pyspark.mllib.clustering import KMeans, KMeansModel
from sklearn.cluster import KMeans
import numpy as np
from numpy import array
from math import sqrt

#timer start
start_time = time.time()

#creating a spark context
conf = pyspark.SparkConf().setMaster("local[*]").setAppName("bfr").setAll([
    ('spark.executor.memory', '8g'), ('spark.executor.cores', '3'),
    ('spark.cores.max', '3'), ('spark.driver.memory', '8g')
])
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sc = SparkContext(conf=conf)

#creating a spark context
#sc = SparkContext('local[*]','first')
sc.setLogLevel('ERROR')

#take command line inputs
input_path = sys.argv[1]
input_clusters = int(sys.argv[2])
output_path = sys.argv[3]

#initalizing Discard set (DS), Compression set (CS), Retained set (RS), final clustering results list and intermediate results list
discard_set = list()
Example #16
import pyspark

conf = pyspark.SparkConf().setAppName('test').setMaster('local[*]')
SparkContext = pyspark.SparkContext(conf=conf)

data = [1, 2, 3, 4, 5]
distData = SparkContext.parallelize(data)

distFile = SparkContext.textFile("file:///usr/local/spark/README.md")

rdd = SparkContext.parallelize(range(1, 4)).map(lambda x: (x, "a" * x))
rdd.saveAsSequenceFile("file:///home/zhao/文档/hahaha")
sorted(SparkContext.sequenceFile("file:///home/zhao/文档/hahaha").collect())

conf = {
    "es.resource": "index/type"
}  # assume Elasticsearch is running on localhost defaults
rdd = SparkContext.newAPIHadoopRDD(
    "org.elasticsearch.hadoop.mr.EsInputFormat",
    "org.apache.hadoop.io.NullWritable",
    "org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf=conf)
rdd.first()

lines = SparkContext.textFile("file:///usr/local/spark/README.md")
lineLengths = lines.map(lambda s: len(s))
totalLength = lineLengths.reduce(lambda a, b: a + b)

lineLengths.persist()

Example #17
		print("\nExample:")
		print("python link-triples-to-orgs.py "
			  "/home/madis/IR/data/company-urls/company-urls.csv "
			  "/home/madis/IR/data/microdata_from_warcs/skumatch "
			  "/tmp/triples-to-companies")
		sys.exit(1)
	else:
		company_csv_list_loc = sys.argv[1]
		input_loc = sys.argv[2]
		output_loc = sys.argv[3]

	start = time.time()

	cluster = False

	s_conf = pyspark.SparkConf()
	s_conf.set("spark.executor.instances", "60")
	s_conf.set("spark.dynamicAllocation.enabled", "false")

	sc = pyspark.SparkContext(conf=s_conf)
	sc.setLogLevel("ERROR")
	sqlContext = pyspark.SQLContext(sc)

	if cluster:
		array = ["Not Disclosed - Visit www.internet.ee for webbased WHOIS"]

		company_data = sqlContext.read.format("org.apache.phoenix.spark").option("table", 'IR.STG_DOMAIN').option("fetchSize",
																											"10000").option(
			"numPartitions", "5000").option("zkUrl", "ir-hadoop1,ir-hadoop2,ir-hadoop3:2181").load()

		company_df = company_data.select(company_data.DOMAIN, company_data.REG_CODE).filter(
Example #18
#pip install graphframes

#sc = SparkContext('local[*]', 'task1')


import os
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
from graphframes import *
import random


os.environ["PYSPARK_SUBMIT_ARGS"] = ("--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11")

scConf = pyspark.SparkConf() \
    .setAppName('hw4') \
    .setMaster('local[3]')
sc = SparkContext(conf = scConf)

# sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
sc.setLogLevel('ERROR')

N=7
#N=int(sys.argv[1])

input_file_path = '../../PycharmProjects/553hw4/ub_sample_data.csv'
#input_file_path =sys.argv[2]
textRDD = sc.textFile(input_file_path).persist()

output_file_path = '../../PycharmProjects/553hw4/task1_1_ans.txt'
Example #19
import pyspark
import nltk
from nltk.corpus import stopwords
import string

# word tokenizer
def word_tokenize1(x):
    import nltk
    x = x.lower()
    return nltk.word_tokenize(x)

conf = pyspark.SparkConf().setAppName('Lotr').setMaster('local')
sc = pyspark.SparkContext(conf=conf)
lotr1 = sc.textFile("spark/dataset/The Lord Of The Ring 1-The Fellowship Of The Ring_djvu.txt") 

lotr1_words = lotr1.flatMap(word_tokenize1)

lotr1_words.take(40)


stop_words=set(stopwords.words('english'))
lotr1_words_filtered = lotr1_words.filter(lambda word: word not in stop_words and word != '')

lotr1_words_filtered.take(10)

list_punct='!()-[]{};:\'"\,<>./?@#$%^&*_~“’`'
lotr1_words_filtered_np = lotr1_words_filtered.filter(lambda punct : punct not in list_punct)

lotr1_words_filtered_np.take(10)

text_Classifi = lotr1_words_filtered_np.flatMap(lambda x : nltk.FreqDist(x.split(",")).most_common()).map(lambda x: x).reduceByKey(lambda x,y : x+y).sortBy(lambda x: x[1], ascending = False)
Example #20
import os
import geocoder

# Build spark session
import findspark

# spark location on namenode server
findspark.init("/usr/hdp/current/spark2-client")
import pyspark
conf = pyspark.SparkConf().setAll([
    ('spark.app.name', 'guobiao_tsp_tbls.trip_map'),  # App Name
    ('spark.master', 'yarn'),  # spark run mode: locally or remotely
    ('spark.submit.deployMode',
     'client'),  # deploy in yarn-client or yarn-cluster
    ('spark.executor.memory', '10g'),  # memory allocated for each executor
    #('spark.memory.fraction', '0.7'),
    ('spark.executor.cores', '3'),  # number of cores for each executor
    ('spark.executor.instances', '5'),  # number of executors in total
    ('spark.driver.maxResultSize',
     '5g'),  # Result size is large, need to increase from default of 1g
    ('spark.yarn.am.memory', '10g')
])  # memory for spark driver (application master)
sc = pyspark.SparkContext.getOrCreate(conf=conf)

from pyspark.sql import HiveContext

# Hive context
hc = HiveContext(sc)


def GenerateTrips(sc):
Example #21
    @classmethod
    def setUpClass(cls):
        conf = pyspark.SparkConf().setMaster('local[1]').setAppName("testing")
        cls.sc = pyspark.SparkContext(conf=conf)
        cls.sqlContext = pyspark.SQLContext(cls.sc)
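
A matching tearDownClass keeps test classes isolated by releasing the context; a minimal sketch assuming the standard unittest layout:

    @classmethod
    def tearDownClass(cls):
        # Stop the SparkContext so later test classes can create their own.
        cls.sc.stop()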
Example #22
from pyspark import SparkContext
import pyspark
from itertools import islice
import datetime
import os
os.environ["PYSPARK_DRIVER_PYTHON"] = "ipython3"
os.environ["PYSPARK_PYTHON"] = "/usr/local/bin/python3"
conf = pyspark.SparkConf().setAll([('spark.executor.memory', '8g'),
                                   ('spark.driver.memory', '8g'),
                                   ('spark.driver.maxResultSize', '3g')])
sc = SparkContext("local", "PySpark Word Count Exmaple", conf=conf)


def parse_data(stringa):
    year, month, day = stringa.split("-")
    return datetime.datetime(int(year), int(month), int(day))

words = sc.textFile("../prova.csv") \
    .map(lambda line: line.split(",")) \
    .mapPartitionsWithIndex(lambda idx, it: islice(it, 1, None) if idx == 0 else it)\
    .filter(lambda line: int(line[7][:4]) >= 1998) \
    .map(lambda line: (line[0],[float(line[2]),float(line[4]),float(line[5]),int(line[6]),parse_data(line[7])]))

lowTheMin = words.reduceByKey(lambda a, b: a if a[1] < b[1] else b).map(
    lambda line: (line[0], line[1][1]))
highTheMax = words.reduceByKey(lambda a, b: a if a[2] > b[2] else b).map(
    lambda line: (line[0], line[1][2]))
volume_totale = words.reduceByKey(
    lambda a, b: [a[0], a[1], a[2], a[3] + b[3], a[4]]).map(
        lambda line: (line[0], line[1][3]))
ticker_giorni = words.map(lambda linea: (linea[0], 1)).reduceByKey(
    lambda a, b: a + b)
Example #23
File: engine.py, Project: icrona/datafaucet
    def __init__(self,
                 session_name=None,
                 session_id=0,
                 master='local[*]',
                 timezone=None,
                 jars=None,
                 packages=None,
                 pyfiles=None,
                 files=None,
                 repositories=None,
                 services=None,
                 conf=None):

        #call base class
        # stop the previous instance,
        # register self as the new instance
        super().__init__('spark', session_name, session_id)

        # bundle all submit in a dictionary
        self.submit = {
            'jars': [jars] if isinstance(jars, str) else jars or [],
            'packages':
            [packages] if isinstance(packages, str) else packages or [],
            'py-files':
            [pyfiles] if isinstance(pyfiles, str) else pyfiles or [],
            'files': [files] if isinstance(files, str) else files or [],
            'repositories': [repositories]
            if isinstance(repositories, str) else repositories or [],
            'conf': [conf] if isinstance(conf, tuple) else conf or [],
        }

        # suppress INFO logging for java_gateway
        python_logging.getLogger('py4j.java_gateway').setLevel(
            python_logging.ERROR)

        # collect info
        self.set_info()

        # detect packages and configuration from services
        detected = self.detect_submit_params(services)

        # merge up with those passed with the init
        for k in self.submit.keys():
            self.submit[k] = list(sorted(set(self.submit[k] + detected[k])))

        #set submit args via env variable
        self.set_submit_args()

        # set other spark-related environment variables
        self.set_env_variables()

        # set spark conf object
        print(f"Connecting to spark master: {master}")

        conf = pyspark.SparkConf()
        self.set_conf_timezone(conf, timezone)

        # set session name
        conf.setAppName(session_name)

        # set master
        conf.setMaster(master)

        # config passed through the api call go via the config
        for c in self.submit['conf']:
            k, v, *_ = list(c) + ['']
            if isinstance(v, (bool, int, float, str)):
                conf.set(k, v)

        # stop the current session if running
        self._stop()

        # start spark
        spark_session = self.start_context(conf)

        # record the data in the engine object for debug and future references
        self.conf = YamlDict(dict(conf.getAll()))

        if spark_session:
            self.conf = dict(
                dict(spark_session.sparkContext.getConf().getAll()))

            # set version if spark is loaded
            self._version = spark_session.version
            print(
                f'Engine context {self.engine_type}:{self.version} successfully started'
            )

            # store the spark session
            self.context = spark_session

            # session is running
            self.stopped = False
Example #24
import os
os.environ['PYSPARK_PYTHON'] = '/usr/local/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/local/bin/python3'
from data_utils import get_data, load_ratings_data_with_spark
from pyspark.sql import SparkSession
import pyspark
from pyspark.sql.functions import *
# spark = SparkSession.builder \
#     .master('local[*]') \
#     .config("spark.driver.memory", "15g") \
#     .appName('MovieRecommender') \
#     .master("local[4]")\
#     .getOrCreate()
# conf = new SparkConf().set("spark.executor.memory", "4g")
config = pyspark.SparkConf().setAll([('spark.driver.memory', '8g')])
spark = SparkSession.builder.config(conf=config).getOrCreate()
size = '25m'
get_data(size)  #  data is '25M' or '100k'  use argparse
# ds = load_ratings_data(size)
ratings_df = load_ratings_data_with_spark(size, spark)
ratings_df = ratings_df.drop('timestamp')
train, test = ratings_df.randomSplit([0.75, 0.25])
ratings_df.unpersist()
from pyspark.ml.recommendation import ALS
model = ALS(maxIter=10,
            regParam=0.01,
            userCol='userId',
            itemCol='movieId',
            ratingCol='rating',
            nonnegative=True,
Example #25
    def __init__(self, idempotent, sc, spark_conf, app_name, master, local,
                 log, quiet, append, min_block_size, branching_factor, tmpdir,
                 local_tmpdir, skip_logging_configuration,
                 optimizer_iterations):
        super(SparkBackend, self).__init__()

        if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
            hail_jar_path = pkg_resources.resource_filename(
                __name__, "hail-all-spark.jar")
            assert os.path.exists(
                hail_jar_path), f'{hail_jar_path} does not exist'
            conf = pyspark.SparkConf()

            base_conf = spark_conf or {}
            for k, v in base_conf.items():
                conf.set(k, v)

            jars = [hail_jar_path]

            if os.environ.get('HAIL_SPARK_MONITOR') or os.environ.get(
                    'AZURE_SPARK') == '1':
                import sparkmonitor
                jars.append(
                    os.path.join(os.path.dirname(sparkmonitor.__file__),
                                 'listener.jar'))
                conf.set("spark.extraListeners",
                         "sparkmonitor.listener.JupyterSparkMonitorListener")

            conf.set('spark.jars', ','.join(jars))
            if os.environ.get('AZURE_SPARK') == '1':
                print(
                    'AZURE_SPARK environment variable is set to "1", assuming you are in HDInsight.'
                )
                # Setting extraClassPath in HDInsight overrides the classpath entirely so you can't
                # load the Scala standard library. Interestingly, setting extraClassPath is not
                # necessary in HDInsight.
            else:
                conf.set('spark.driver.extraClassPath', ','.join(jars))
                conf.set('spark.executor.extraClassPath',
                         './hail-all-spark.jar')
            if sc is None:
                pyspark.SparkContext._ensure_initialized(conf=conf)
            elif not quiet:
                sys.stderr.write(
                    'pip-installed Hail requires additional configuration options in Spark referring\n'
                    '  to the path to the Hail Python module directory HAIL_DIR,\n'
                    '  e.g. /path/to/python/site-packages/hail:\n'
                    '    spark.jars=HAIL_DIR/backend/hail-all-spark.jar\n'
                    '    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar\n'
                    '    spark.executor.extraClassPath=./hail-all-spark.jar')
        else:
            pyspark.SparkContext._ensure_initialized()

        self._gateway = pyspark.SparkContext._gateway
        self._jvm = pyspark.SparkContext._jvm

        hail_package = getattr(self._jvm, 'is').hail

        self._hail_package = hail_package
        self._utils_package_object = scala_package_object(hail_package.utils)

        jsc = sc._jsc.sc() if sc else None

        if idempotent:
            self._jbackend = hail_package.backend.spark.SparkBackend.getOrCreate(
                jsc, app_name, master, local, True, min_block_size, tmpdir,
                local_tmpdir)
            self._jhc = hail_package.HailContext.getOrCreate(
                self._jbackend, log, True, append, branching_factor,
                skip_logging_configuration, optimizer_iterations)
        else:
            self._jbackend = hail_package.backend.spark.SparkBackend.apply(
                jsc, app_name, master, local, True, min_block_size, tmpdir,
                local_tmpdir)
            self._jhc = hail_package.HailContext.apply(
                self._jbackend, log, True, append, branching_factor,
                skip_logging_configuration, optimizer_iterations)

        self._jsc = self._jbackend.sc()
        if sc:
            self.sc = sc
        else:
            self.sc = pyspark.SparkContext(gateway=self._gateway,
                                           jsc=self._jvm.JavaSparkContext(
                                               self._jsc))
        self._jspark_session = self._jbackend.sparkSession()
        self._spark_session = pyspark.sql.SparkSession(self.sc,
                                                       self._jspark_session)

        # This has to go after creating the SparkSession. Unclear why.
        # Maybe it does its own patch?
        install_exception_handler()

        from hail.context import version

        py_version = version()
        jar_version = self._jhc.version()
        if jar_version != py_version:
            raise RuntimeError(
                f"Hail version mismatch between JAR and Python library\n"
                f"  JAR:    {jar_version}\n"
                f"  Python: {py_version}")

        self._fs = None
        self._logger = None

        if not quiet:
            sys.stderr.write('Running on Apache Spark version {}\n'.format(
                self.sc.version))
            if self._jsc.uiWebUrl().isDefined():
                sys.stderr.write('SparkUI available at {}\n'.format(
                    self._jsc.uiWebUrl().get()))

            connect_logger(self._utils_package_object, 'localhost', 12888)

            self._jbackend.startProgressBar()
Example #26
import findspark
import pyspark
import requests
from pyspark.sql import SQLContext, HiveContext
from pyspark.sql import functions as fn
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline
from pyspark.ml.feature import IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

conf = pyspark.SparkConf().\
    setAppName('hva-data-scientist').\
    setMaster('local[*]')

sc = pyspark.SparkContext(conf=conf)
sqlContext = HiveContext(sc)

findspark.init()


class Spark:
    def __init__(self):
        # Convert Pandas dataframe to PySpark dataframe.
        df = sqlContext.read.format("csv").option("header", "true").load("hotel-reviews.csv")
        # df = sqlContext.createDataFrame(pandas_df)

        # Map Reviewer_Score to a Sentiment label (1 if the score is >= 5.5, else 0)
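        # A hypothetical sketch of that step, assuming the CSV column is named Reviewer_Score:
        df = df.withColumn(
            "Sentiment",
            fn.when(fn.col("Reviewer_Score").cast("double") >= 5.5, 1).otherwise(0))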
Example #27
            'longitude': station['longitude']
        }
        name = name[:-1]
    elif station['name'] in ['前海湾', '后海', '大剧院', '购物公园', '深圳北']:
        stations[name] = {
            'latitude': station['latitude'],
            'longitude': station['longitude']
        }
        name = name + '站'
    stations[name] = {
        'latitude': station['latitude'],
        'longitude': station['longitude']
    }

import pyspark
confi = pyspark.SparkConf()
confi.set('spark.network.timeout', '240s')
confi.set('spark.executor.memory', '1500m')
confi.set('spark.driver.maxResultSize', '1500m')
confi.set('spark.daemon.java.opts', '-Xmx10000m')
confi.set('spark.daemon.memory', '10g')
confi.setMaster('local[20]')
sc = pyspark.SparkContext(conf=confi)


def map_function(i, source_id, des_id):
    mon = '09'
    days = ['05', '06', '09', '26', '28', '30']
    if not os.path.exists('/home/XXX/Python_Output/trajectories/new_agent/' +
                          str(i) + '/'):
        os.makedirs('/home/XXX/Python_Output/trajectories/new_agent/' +
Example #28
import pyspark as ps

conf = ps.SparkConf().setMaster("local").setAppName("parallelProcessing")
sc = ps.SparkContext(conf=conf)

data = [1, 2, 3, 4, 5]
distData = sc.parallelize(
    data)  # Create a distributed collection, i.e. an RDD (1 partition)
distDataP = sc.parallelize(
    data, 3)  # Slice the dataset into 3 partitions, 3 way parallelism

print(distDataP.count())
print(distDataP.getNumPartitions())
print(distDataP.reduce(lambda x, y: x + y))
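
To see how the elements were actually sliced across the three partitions, glom() collects each partition as a separate list (a quick inspection sketch):

print(distDataP.glom().collect())  # e.g. [[1], [2, 3], [4, 5]] -- one list per partition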
Example #29
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import pyspark
import os


def doIt():
    print "---------RESULT-----------"
    myRdd=spark.sparkContext\
        .textFile(os.path.join("data","ds_spark_wiki.txt"))
    res=myRdd\
        .flatMap(lambda x:x.split())\
        .map(lambda x:(x,1))\
        .reduceByKey(lambda x,y:x+y)\
        .map(lambda x:(x[1],x[0]))\
        .sortByKey(False)\
        .take(10)
    for i in res:
        print(i)


if __name__ == "__main__":
    myConf = pyspark.SparkConf()
    spark = pyspark.sql.SparkSession.builder\
        .master("local")\
        .appName("myApp")\
        .config(conf=myConf)\
        .getOrCreate()
    doIt()
    spark.stop()
Example #30
    emotion = ""
    loudness = abs(an.loudness(inputdata))
    filename = filename.split("/")[-1].split(".")[0]
    if filename[0] == "s":
        emotion = filename[0:2]
        emotion = ord(emotion[0]) + ord(emotion[1])
    else:
        emotion = filename[0]
        emotion = float(ord(emotion)) / 100
    return [float(loudness), float(pitch), emotion]


working_directory = os.getcwd()
working_directory = working_directory + "/"

configuration = py.SparkConf()  # set up the Spark configuration
sContext = py.SparkContext(conf=configuration)  # create the Spark context
sContext.defaultParallelism
print("Data preprocessing start time:", datetime.datetime.now().time())
testdata = sContext.parallelize(
    gb.glob(
        "/media/vyassu/OS/Users/vyas/Documents/Assigments/BigData/AudioData/DC/*"
    )).map(dataconverter)
data = testdata.map(getData)
print("Data preprocessing end time:", datetime.datetime.now().time())
print(data.take(10))
# data1 = sContext.textFile(working_directory+"Test-TrainingData_SVM.csv")
#
#print testdata.count()
# #
# parsedData = data1.map(parsePoint)