import sys

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql.context import SQLContext
from pyspark.rdd import RDD
import pyspark

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: kafka_wordcount.py <zk> <topic>", file=sys.stderr)
        exit(-1)

    conf = pyspark.SparkConf().set('spark.driver.host', '127.0.0.1')
    sc = pyspark.SparkContext(master='local[*]',
                              appName='PythonStreamingKafkaWordCount',
                              conf=conf)
    ssc = StreamingContext(sc, 1)
    # sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    # ssc = StreamingContext(sc, 1)

    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    lines.pprint()
    # counts = lines.flatMap(lambda line: line.split(" ")) \
    #     .map(lambda word: (word, 1)) \
    #     .reduceByKey(lambda a, b: a+b)
    # counts.pprint()

    ssc.start()
import sys
import time
import json
import binascii

import pyspark

# hyper-parameters for the hash functions
LENGTH_BIT_ARRAY = 10000
# NUM_HASH = 8  # optimal k = (n / m) * ln2
HASH_PARA_A = [1, 2, 3, 5, 7, 11, 13, 17]
HASH_PARA_B = [23, 7717, 5837, 8147, 874, 457, 3529, 15]

if __name__ == "__main__":
    start_time = time.time()

    # parse command-line arguments
    first_json_path = sys.argv[1]
    second_json_path = sys.argv[2]
    output_file_path = sys.argv[3]

    conf = pyspark.SparkConf().setAppName("Task1").setMaster("local[*]")
    sc = pyspark.SparkContext(conf=conf)
    sc.setLogLevel("ERROR")

    cityRDD = sc.textFile(first_json_path).map(lambda x: json.loads(x)).map(
        lambda x: x["city"])
    city_set = set(cityRDD.distinct().collect())
    city_set.discard('')

    bit_array = [0 for _ in range(LENGTH_BIT_ARRAY)]
    for city in city_set:
        hashcodes = [(a * int(binascii.hexlify(city.encode("utf8")), 16) + b) % LENGTH_BIT_ARRAY
                     for a, b in zip(HASH_PARA_A, HASH_PARA_B)]
        for hashcode in hashcodes:
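# A minimal side sketch (not part of the original task) of the "optimal k" rule the
# NUM_HASH comment above refers to, assuming m is the bit-array length and n is the
# expected number of distinct items; the function and example values are hypothetical.
import math

def optimal_num_hashes(bit_array_length, expected_items):
    # k = (m / n) * ln 2, rounded to a whole number of hash functions
    return max(1, round((bit_array_length / expected_items) * math.log(2)))

# e.g. with a 10,000-bit array and roughly 1,000 distinct cities, k comes out to about 7
print(optimal_num_hashes(10000, 1000))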
def spark():
    conf = pyspark.SparkConf()
    return get_spark_session(conf)
import pyspark as ps
from pyspark.sql import SQLContext

conf = ps.SparkConf().setMaster("local[4]").setAppName("p5_spark")
sc = ps.SparkContext(conf=conf)

textPath = 'Meteorite_Landings.csv'
sqlContext = SQLContext(sc)
df = sqlContext.read.csv(textPath)

# Extract the nameType and mass columns
dataDF = df.select("_c2", "_c4")

NoneType = type(None)
dataRDD = dataDF.rdd.filter(lambda x: type(x._c2) != NoneType and type(x._c4) != NoneType)
dataRDD = dataRDD.map(lambda x: (x._c2.encode('utf-8').split("-")[0],
                                 float(x._c4.encode('utf-8'))))

dataPerType = dataRDD.groupByKey().map(lambda x: (x[0], list(x[1])))
sumDataPerType = dataPerType.map(lambda x: (x[0], sum(x[1])))
elementsPerData = dataPerType.map(lambda x: (x[0], len(x[1])))
elementsJoined = sumDataPerType.join(elementsPerData)
averagePerType = elementsJoined.map(lambda x: (x[0], x[1][0] / x[1][1]))

print(averagePerType.sortByKey().collect())
import findspark
spark_home = r"C:\Users\our\Downloads\spark-1.6.0-bin-hadoop2.6"
findspark.init(spark_home)

import pyspark
conf = pyspark.SparkConf().setAppName("myApp")
sc = pyspark.SparkContext(conf=conf)

import oauth2
import os
import urllib
import json
import codecs


def getKey(keyPath):
    d = dict()
    f = open(keyPath, 'r')
    lines = f.readlines()
    for line in lines:
        row = line.split('=')
        row0 = row[0]
        d[row0] = row[1].strip()
    return d


keyPath = os.path.join(os.getcwd(), 'src', 'key.properties')
key = getKey(keyPath)

consumer = oauth2.Consumer(key=key['api_key'], secret=key['api_secret'])
token = oauth2.Token(key=key['access_token'], secret=key['access_secret'])
client = oauth2.Client(consumer, token)
import pyspark
import pyspark.sql

if __name__ == '__main__':
    config = pyspark.SparkConf().setAppName("Basico")
    sc = pyspark.SparkContext(conf=config)
    sqlCtx = pyspark.sql.SQLContext(sc)

    dfVuelos = sqlCtx.read.csv('hdfs://localhost:9000/datos', header=True)
    dfVuelos.printSchema()
    dfVuelos.createOrReplaceTempView("VUELOS")

    sqlCtx.sql("""
        select flightNum, SUM(cast(distance as int))
        from VUELOS
        group by flightNum
    """).show()
        LanguageCode='en')
    if len(respond['ErrorList']) != 0:
        error_index = set()
        for the_error in respond['ErrorList']:
            error_index.add(the_error['Index'])
        for i in range(25):
            if i not in error_index:
                filtered.append((json_data["created_utc"], json_data["subreddit"],
                                 (respond['ResultList'][i]['Sentiment'],
                                  respond['ResultList'][i]['SentimentScore'])))
    else:
        for i in range(25):
            filtered.append((json_data["created_utc"], json_data["subreddit"],
                             (respond['ResultList'][i]['Sentiment'],
                              respond['ResultList'][i]['SentimentScore'])))
    batch = []
    return filtered


if __name__ == "__main__":
    num_cores = 8
    num_partitions = num_cores * 100

    conf = pyspark.SparkConf().setAppName("RedditDataLoder")
    sc = pyspark.SparkContext(conf=conf)

    data = sc.textFile(data_path, minPartitions=num_partitions).mapPartitions(filter_patition)
    lengths = data.collect()
    print(len(lengths), ' ', lengths[:100])
    # data = sc.textFile(data_path, minPartitions=num_partitions)
    # print(data.count())
          'vehicle.py',
          'vehicle_stats.py',
          'vehicle_utils.py',
          'get_dist_on_battery.py',
          'get_charged_stats.py',
          'get_total_hours.py',
          'get_average_speed.py',
          'get_distance_driven.py',
          'treat_missing_data.py',
          'get_energy_n_capacity.py',
          'get_geo_stats.py',
          'COL_DEPENDENCY_DICT.py']:
    zf.write('stats-core/' + f, f)
zf.write('configs.py')
zf.close()

# configs
conf = pyspark.SparkConf().setAll([('spark.app.name', 'guobiao_daily_stats_run'),
                                   ('spark.master', 'yarn'),
                                   ('spark.submit.deployMode', 'client'),
                                   ('spark.executor.memory', '5g'),
                                   ('spark.memory.fraction', '0.7'),
                                   ('spark.executor.cores', '3'),
                                   ('spark.executor.instances', '10'),
                                   ('spark.yarn.am.memory', '10g')])

conf1 = pyspark.SparkConf().setAll([('spark.app.name', 'guobiao_export_to_hive'),
                                    ('spark.master', 'local'),
                                    ('spark.executor.memory', '5g'),
                                    ('spark.memory.fraction', '0.7'),
                                    ('spark.executor.cores', '3')])

COL_NUM_DICT = {
    1: 'vintype',
    2: 'ts',
    4: 'veh_charge_st',
import os

from pyspark.ml.feature import FeatureHasher
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml import Pipeline
from pyspark.sql import functions as F

#######################################################################################
# CONFIGURATIONS

# Get current cluster setup from work directory
STREAMING_WINDOW = 60

# Initialize PySpark
SPARK_MASTER = "local[1]"
# SPARK_MASTER = "spark://mpp3r03c04s06.cos.lrz.de:7077"
APP_NAME = "PySpark Lecture"
os.environ["PYSPARK_PYTHON"] = "/naslx/projects/pn69si/mnmda001/software/anaconda3/bin/python"

# If there is no SparkSession, create the environment
try:
    sc and spark
except NameError as e:
    import pyspark
    import pyspark.sql

    conf = pyspark.SparkConf().set("spark.cores.max", "4")
    sc = pyspark.SparkContext(master=SPARK_MASTER, conf=conf)
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
    print("PySpark initiated...")
#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Initialize SparkContext and SparkSession
import findspark, os
findspark.init('/home/hadoop/spark')
import pyspark
from pyspark.sql import SparkSession

conf = pyspark.SparkConf().setMaster("yarn").setAppName("Jupyter PySpark Test")
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)

# In[2]:

# Read title.basics.tsv into a Spark dataframe
imdb_title_basics_dataframe = spark.read.format('csv').options(
    header='true', delimiter='\t', nullValue='null',
    inferschema='true').load('/user/hadoop/imdb/title_basics/title.basics.tsv')

# In[3]:

imdb_title_basics_dataframe.printSchema()  # Print schema of title_basics dataframe

# In[4]:

imdb_title_basics_dataframe.show(5)  # Show first 5 rows of title_basics dataframe
import pyspark
import os
import sys
import argparse
import importlib

if os.path.exists('text_reuse_pipline.zip'):
    sys.path.insert(0, 'text_reuse_pipline.zip')
else:
    sys.path.insert(0, './text_reuse_pipline.zip')

parser = argparse.ArgumentParser()
parser.add_argument('--job', type=str, required=True)
parser.add_argument('--job_args', nargs='*')
args = parser.parse_args()
print(args)

conf = (pyspark.SparkConf().setAppName(args.job)
        .set("spark.network.timeout", 300)
        .set("spark.executor.extraJavaOptions",
             "-XX:+UseCompressedOops -XX:+UseG1GC -XX:+UseStringDeduplication "
             "-Dio.netty.leakDetection.level=advanced")
        .set('spark.dynamicAllocation.enabled', False))

sc = pyspark.SparkContext(conf=conf)
sqlC = pyspark.SQLContext(sc)

job_module = importlib.import_module('text_reuse_pipline.jobs.%s' % args.job)
job_module.run(sc, args.job_args)
import time

import pyspark

conf = pyspark.SparkConf().setAppName("Dijkstra").set("spark.dynamicAllocation.enabled", "false")
sc = pyspark.SparkContext(conf=conf)

log4jLogger = sc._jvm.org.apache.log4j
logger = log4jLogger.LogManager.getLogger(__name__)


# helper functions #

def read_generated_graph_line(line):
    line = line.strip().split("\t")
    if len(line) == 2:
        return
    elif len(line) == 3:
        origin = line[0]
        neighbours = line[2]
        try:
            return [(origin, (pair.split(":")[0].strip(), int(pair.split(":")[1].strip())))
                    for pair in neighbours.split(",")]
        except IndexError:
            raise RuntimeError("file not well formatted")
    else:
        return


def shortest_path_to_point(x, y):
    """This function is a reduce function that computes the shortest path to a certain point (the key)."""
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import pickle5 as pickle
import os
from scipy import spatial

import pyspark
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType

# Project configuration: set the app name, give the driver 6 GB of memory and set the master location
conf = (pyspark.SparkConf().setAppName('test').set("spark.driver.memory", "6g").setMaster("local[*]"))

# Pass the configuration to the Spark context
sc = pyspark.SparkContext(master='local', appName='myAppName', conf=conf)
sc.setLogLevel("ERROR")

# Create a Spark session
spark = SparkSession.builder.appName('read').getOrCreate()

# Add the file containing the database to Spark
sc.addPyFile("jars/graphframes-0.8.0-spark3.0-s_2.12.jar")
from graphframes import *

"""
create the df of papers by folder authors with:
 - id
 - year
 - list of authors
"""
def main(arguments):
    """Begin running the modeller."""
    loggers = logger.get_logger()

    # set up the spark configuration
    loggers.debug("Connecting to Spark")
    conf = (pyspark.SparkConf().setAppName("JiminyModeler")
            .set('spark.executor.memory', '1G')
            .set('spark.driver.memory', '1G')
            .set('spark.driver.maxResultSize', '1G'))

    # get the spark context
    spark = pyspark.sql.SparkSession.builder.config(conf=conf).getOrCreate()
    sc = spark.sparkContext

    # set up SQL connection
    try:
        con = build_connection(arguments)
    except IOError:
        loggers.error("Could not connect to data store")
        sys.exit(1)

    # fetch the data from the db
    cursor = con.cursor()
    cursor.execute("SELECT * FROM ratings")
    ratings = cursor.fetchall()
    loggers.info("Fetched data from table")

    # create an RDD of the ratings data
    ratingsRDD = sc.parallelize(ratings)

    # get the largest timestamp; we use this to determine new entries later
    max_timestamp = ratingsRDD.map(lambda x: x[4]).max()

    # remove the final column, which contains the timestamps
    ratingsRDD = ratingsRDD.map(lambda x: (x[1], x[2], x[3]))

    # split the RDD into 3 sections: training, validation and testing
    estimator = modeller.Estimator(ratingsRDD)

    if get_arg('DISABLE_FAST_TRAIN', args.slowtrain) is True:
        loggers.warn("Any ALS parameters given on the command line will not"
                     " be used when fast train is disabled.")
        # basic parameter selection
        loggers.info('Using slow training method')
        parameters = estimator.run(ranks=[2, 4, 6, 8],
                                   lambdas=[0.01, 0.05, 0.09, 0.13],
                                   iterations=[2])
    else:
        # override basic parameters for faster testing
        loggers.info('Using fast training method')
        parameters = {
            'rank': arguments.rankval,
            'lambda': arguments.lambdaval,
            'iteration': arguments.itsval
        }

    # train the model
    model = modeller.Trainer(data=ratingsRDD,
                             rank=parameters['rank'],
                             iterations=parameters['iteration'],
                             lambda_=parameters['lambda'],
                             seed=42).train()
    loggers.info('Model has been trained')

    # write the model to the model store
    model_version = 1
    writer = storage.MongoDBModelWriter(sc=sc, uri=arguments.mongoURI)
    writer.write(model=model, version=1)
    loggers.info('Model version 1 written to model store')

    while True:
        # This loop is the heart of this application; it will continually loop
        # until killed by the orchestration engine. On each pass it should
        # generally do the following:
        # 1. check to see if it should create a new model
        # 2. if yes, create a new model. if no, continue looping
        #    (perhaps with a delay)
        # 3. store the new model

        # check to see if a new model should be created:
        # select the maximum timestamp from the ratings database
        cursor.execute(
            "SELECT timestamp FROM ratings ORDER BY timestamp DESC LIMIT 1;")
        checking_max_timestamp = cursor.fetchone()[0]
        loggers.info("The latest timestamp = {}".format(checking_max_timestamp))

        if checking_max_timestamp > max_timestamp:
            # build a new model
            # first, fetch all new ratings
            cursor.execute("SELECT * FROM ratings WHERE (timestamp > %s);",
                           (max_timestamp, ))
            new_ratings = cursor.fetchall()
            max_timestamp = checking_max_timestamp

            new_ratingsRDD = sc.parallelize(new_ratings)
            new_ratingsRDD = new_ratingsRDD.map(lambda x: (x[1], x[2], x[3]))
            ratingsRDD = ratingsRDD.union(new_ratingsRDD)

            model_version += 1
            loggers.info("Training model, version={}".format(model_version))

            # train the model
            model = modeller.Trainer(data=ratingsRDD,
                                     rank=parameters['rank'],
                                     iterations=parameters['iteration'],
                                     lambda_=parameters['lambda'],
                                     seed=42).train()
            loggers.info("Model has been trained.")

            writer.write(model=model, version=model_version)
            loggers.info("Model version %d written to model store." % model_version)
        else:
            # sleep for 2 minutes
            loggers.info("sleeping for 120 seconds")
            time.sleep(120)
from pyspark import SparkContext, SparkConf
import pyspark
import time
import sys
from pyspark.mllib.clustering import KMeans, KMeansModel
from sklearn.cluster import KMeans
import numpy as np
from numpy import array
from math import sqrt

# timer start
start_time = time.time()

# creating a spark context
conf = pyspark.SparkConf().setMaster("local[*]").setAppName("bfr").setAll([
    ('spark.executor.memory', '8g'),
    ('spark.executor.cores', '3'),
    ('spark.cores.max', '3'),
    ('spark.driver.memory', '8g')
])
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sc = SparkContext(conf=conf)
# sc = SparkContext('local[*]', 'first')
sc.setLogLevel('ERROR')

# take command line inputs
input_path = sys.argv[1]
input_clusters = int(sys.argv[2])
output_path = sys.argv[3]

# initializing Discard set (DS), Compression set (CS), Retained set (RS),
# final clustering results list and intermediate results list
discard_set = list()
import pyspark

conf = pyspark.SparkConf().setAppName('test').setMaster('local[*]')
sc = pyspark.SparkContext(conf=conf)

data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)

distFile = sc.textFile("file:///usr/local/spark/README.md")

rdd = sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x))
rdd.saveAsSequenceFile("file:///home/zhao/文档/hahaha")
sorted(sc.sequenceFile("file:///home/zhao/文档/hahaha").collect())

conf = {"es.resource": "index/type"}  # assume Elasticsearch is running on localhost defaults
rdd = sc.newAPIHadoopRDD(
    "org.elasticsearch.hadoop.mr.EsInputFormat",
    "org.apache.hadoop.io.NullWritable",
    "org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf=conf)
rdd.first()

lines = sc.textFile("file:///usr/local/spark/README.md")
lineLengths = lines.map(lambda s: len(s))
totalLength = lineLengths.reduce(lambda a, b: a + b)
lineLengths.persist()
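# If Elasticsearch is not running on localhost defaults, the same conf dict passed to
# newAPIHadoopRDD can carry the elasticsearch-hadoop connection settings. A minimal
# sketch; the host value is hypothetical and not from the original snippet.
es_conf = {
    "es.resource": "index/type",        # index/type to read from
    "es.nodes": "es-host.example.com",  # hypothetical host; connector defaults to localhost
    "es.port": "9200",                  # default Elasticsearch HTTP port
}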
    print("\nExample:")
    print("python link-triples-to-orgs.py "
          "/home/madis/IR/data/company-urls/company-urls.csv "
          "/home/madis/IR/data/microdata_from_warcs/skumatch "
          "/tmp/triples-to-companies")
    sys.exit(1)
else:
    company_csv_list_loc = sys.argv[1]
    input_loc = sys.argv[2]
    output_loc = sys.argv[3]

start = time.time()
cluster = False

s_conf = pyspark.SparkConf()
s_conf.set("spark.executor.instances", "60")
s_conf.set("spark.dynamicAllocation.enabled", "false")
sc = pyspark.SparkContext(conf=s_conf)
sc.setLogLevel("ERROR")
sqlContext = pyspark.SQLContext(sc)

if cluster:
    array = ["Not Disclosed - Visit www.internet.ee for webbased WHOIS"]
    company_data = sqlContext.read.format("org.apache.phoenix.spark") \
        .option("table", 'IR.STG_DOMAIN') \
        .option("fetchSize", "10000") \
        .option("numPartitions", "5000") \
        .option("zkUrl", "ir-hadoop1,ir-hadoop2,ir-hadoop3:2181").load()
    company_df = company_data.select(company_data.DOMAIN, company_data.REG_CODE).filter(
# pip install graphframes
# sc = SparkContext('local[*]', 'task1')
import os
import random

import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
from graphframes import *

os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11"

scConf = pyspark.SparkConf() \
    .setAppName('hw4') \
    .setMaster('local[3]')
sc = SparkContext(conf=scConf)
# sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
sc.setLogLevel('ERROR')

N = 7
# N = int(sys.argv[1])
input_file_path = '../../PycharmProjects/553hw4/ub_sample_data.csv'
# input_file_path = sys.argv[2]
textRDD = sc.textFile(input_file_path).persist()
output_file_path = '../../PycharmProjects/553hw4/task1_1_ans.txt'
import pyspark
import nltk
from nltk.corpus import stopwords
import string


# word tokenizer
def word_tokenize1(x):
    import nltk
    x = x.lower()
    return nltk.word_tokenize(x)


conf = pyspark.SparkConf().setAppName('Lotr').setMaster('local')
sc = pyspark.SparkContext(conf=conf)

lotr1 = sc.textFile("spark/dataset/The Lord Of The Ring 1-The Fellowship Of The Ring_djvu.txt")
lotr1_words = lotr1.flatMap(word_tokenize1)
lotr1_words.take(40)

# drop English stopwords and empty tokens
stop_words = set(stopwords.words('english'))
lotr1_words_filtered = lotr1_words.filter(lambda word: word not in stop_words and word != '')
lotr1_words_filtered.take(10)

# drop punctuation tokens
list_punct = '!()-[]{};:\'"\,<>./?@#$%^&*_~“’`'
lotr1_words_filtered_np = lotr1_words_filtered.filter(lambda punct: punct not in list_punct)
lotr1_words_filtered_np.take(10)

text_Classifi = lotr1_words_filtered_np \
    .flatMap(lambda x: nltk.FreqDist(x.split(",")).most_common()) \
    .map(lambda x: x) \
    .reduceByKey(lambda x, y: x + y) \
    .sortBy(lambda x: x[1], ascending=False)
import os
import geocoder

# Build spark session
import findspark
# spark location on namenode server
findspark.init("/usr/hdp/current/spark2-client")
import pyspark

conf = pyspark.SparkConf().setAll([
    ('spark.app.name', 'guobiao_tsp_tbls.trip_map'),  # App name
    ('spark.master', 'yarn'),                         # spark run mode: locally or remotely
    ('spark.submit.deployMode', 'client'),            # deploy in yarn-client or yarn-cluster
    ('spark.executor.memory', '10g'),                 # memory allocated for each executor
    # ('spark.memory.fraction', '0.7'),
    ('spark.executor.cores', '3'),                    # number of cores for each executor
    ('spark.executor.instances', '5'),                # number of executors in total
    ('spark.driver.maxResultSize', '5g'),             # result size is large, increase from default of 1g
    ('spark.yarn.am.memory', '10g')                   # memory for spark driver (application master)
])
sc = pyspark.SparkContext.getOrCreate(conf=conf)

from pyspark.sql import HiveContext

# Hive context
hc = HiveContext(sc)


def GenerateTrips(sc):
def setUpClass(cls):
    conf = pyspark.SparkConf().setMaster('local[1]').setAppName("testing")
    cls.sc = pyspark.SparkContext(conf=conf)
    cls.sqlContext = pyspark.SQLContext(cls.sc)
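# A fixture like the setUpClass above is usually paired with a teardown that releases
# the local SparkContext between test classes. A minimal sketch, assuming the same
# unittest-style test class (not part of the original snippet):
@classmethod
def tearDownClass(cls):
    # stop the SparkContext created in setUpClass so the next test class
    # can start a fresh one (only one local context may be active at a time)
    cls.sc.stop()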
from pyspark import SparkContext
import pyspark
from itertools import islice
import datetime
import os

os.environ["PYSPARK_DRIVER_PYTHON"] = "ipython3"
os.environ["PYSPARK_PYTHON"] = "/usr/local/bin/python3"

conf = pyspark.SparkConf().setAll([('spark.executor.memory', '8g'),
                                   ('spark.driver.memory', '8g'),
                                   ('spark.driver.maxResultSize', '3g')])
sc = SparkContext("local", "PySpark Word Count Example", conf=conf)


def parse_data(stringa):
    year, month, day = stringa.split("-")
    return datetime.datetime(int(year), int(month), int(day))


words = sc.textFile("../prova.csv") \
    .map(lambda line: line.split(",")) \
    .mapPartitionsWithIndex(lambda idx, it: islice(it, 1, None) if idx == 0 else it) \
    .filter(lambda line: int(line[7][:4]) >= 1998) \
    .map(lambda line: (line[0], [float(line[2]), float(line[4]), float(line[5]),
                                 int(line[6]), parse_data(line[7])]))

lowTheMin = words.reduceByKey(lambda a, b: a if a[1] < b[1] else b).map(
    lambda line: (line[0], line[1][1]))
highTheMax = words.reduceByKey(lambda a, b: a if a[2] > b[2] else b).map(
    lambda line: (line[0], line[1][2]))
volume_totale = words.reduceByKey(
    lambda a, b: [a[0], a[1], a[2], a[3] + b[3], a[4]]).map(
    lambda line: (line[0], line[1][3]))
ticker_giorni = words.map(lambda linea: (linea[0], 1)).reduceByKey(
def __init__(self, session_name=None, session_id=0, master='local[*]',
             timezone=None, jars=None, packages=None, pyfiles=None, files=None,
             repositories=None, services=None, conf=None):

    # call the base class:
    # stop the previous instance,
    # register self as the new instance
    super().__init__('spark', session_name, session_id)

    # bundle all submit options in a dictionary
    self.submit = {
        'jars': [jars] if isinstance(jars, str) else jars or [],
        'packages': [packages] if isinstance(packages, str) else packages or [],
        'py-files': [pyfiles] if isinstance(pyfiles, str) else pyfiles or [],
        'files': [files] if isinstance(files, str) else files or [],
        'repositories': [repositories] if isinstance(repositories, str) else repositories or [],
        'conf': [conf] if isinstance(conf, tuple) else conf or [],
    }

    # suppress INFO logging for java_gateway
    python_logging.getLogger('py4j.java_gateway').setLevel(python_logging.ERROR)

    # collect info
    self.set_info()

    # detect packages and configuration from services
    detected = self.detect_submit_params(services)

    # merge with those passed to the init
    for k in self.submit.keys():
        self.submit[k] = list(sorted(set(self.submit[k] + detected[k])))

    # set submit args via environment variable
    self.set_submit_args()

    # set other spark-related environment variables
    self.set_env_variables()

    # set spark conf object
    print(f"Connecting to spark master: {master}")

    conf = pyspark.SparkConf()
    self.set_conf_timezone(conf, timezone)

    # set session name
    conf.setAppName(session_name)

    # set master
    conf.setMaster(master)

    # config passed through the api call goes via the conf object
    for c in self.submit['conf']:
        k, v, *_ = list(c) + ['']
        if isinstance(v, (bool, int, float, str)):
            conf.set(k, v)

    # stop the current session if running
    self._stop()

    # start spark
    spark_session = self.start_context(conf)

    # record the data in the engine object for debug and future reference
    self.conf = YamlDict(dict(conf.getAll()))
    if spark_session:
        self.conf = dict(dict(spark_session.sparkContext.getConf().getAll()))

        # set version if spark is loaded
        self._version = spark_session.version
        print(f'Engine context {self.engine_type}:{self.version} successfully started')

        # store the spark session
        self.context = spark_session

        # session is running
        self.stopped = False
import os
os.environ['PYSPARK_PYTHON'] = '/usr/local/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/local/bin/python3'

from data_utils import get_data, load_ratings_data_with_spark
from pyspark.sql import SparkSession
import pyspark
from pyspark.sql.functions import *

# spark = SparkSession.builder \
#     .master('local[*]') \
#     .config("spark.driver.memory", "15g") \
#     .appName('MovieRecommender') \
#     .master("local[4]") \
#     .getOrCreate()
# conf = new SparkConf().set("spark.executor.memory", "4g")

config = pyspark.SparkConf().setAll([('spark.driver.memory', '8g')])
spark = SparkSession.builder.config(conf=config).getOrCreate()

size = '25m'
get_data(size)  # data is '25M' or '100k'; use argparse
# ds = load_ratings_data(size)
ratings_df = load_ratings_data_with_spark(size, spark)
ratings_df = ratings_df.drop('timestamp')

train, test = ratings_df.randomSplit([0.75, 0.25])
ratings_df.unpersist()

from pyspark.ml.recommendation import ALS
model = ALS(maxIter=10, regParam=0.01, userCol='userId', itemCol='movieId',
            ratingCol='rating', nonnegative=True,
def __init__(self, idempotent, sc, spark_conf, app_name, master, local, log,
             quiet, append, min_block_size, branching_factor, tmpdir,
             local_tmpdir, skip_logging_configuration, optimizer_iterations):
    super(SparkBackend, self).__init__()

    if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
        hail_jar_path = pkg_resources.resource_filename(__name__, "hail-all-spark.jar")
        assert os.path.exists(hail_jar_path), f'{hail_jar_path} does not exist'
        conf = pyspark.SparkConf()

        base_conf = spark_conf or {}
        for k, v in base_conf.items():
            conf.set(k, v)

        jars = [hail_jar_path]

        if os.environ.get('HAIL_SPARK_MONITOR') or os.environ.get('AZURE_SPARK') == '1':
            import sparkmonitor
            jars.append(os.path.join(os.path.dirname(sparkmonitor.__file__), 'listener.jar'))
            conf.set("spark.extraListeners", "sparkmonitor.listener.JupyterSparkMonitorListener")

        conf.set('spark.jars', ','.join(jars))
        if os.environ.get('AZURE_SPARK') == '1':
            print('AZURE_SPARK environment variable is set to "1", assuming you are in HDInsight.')
            # Setting extraClassPath in HDInsight overrides the classpath entirely so you can't
            # load the Scala standard library. Interestingly, setting extraClassPath is not
            # necessary in HDInsight.
        else:
            conf.set('spark.driver.extraClassPath', ','.join(jars))
            conf.set('spark.executor.extraClassPath', './hail-all-spark.jar')

        if sc is None:
            pyspark.SparkContext._ensure_initialized(conf=conf)
        elif not quiet:
            sys.stderr.write(
                'pip-installed Hail requires additional configuration options in Spark referring\n'
                '  to the path to the Hail Python module directory HAIL_DIR,\n'
                '  e.g. /path/to/python/site-packages/hail:\n'
                '    spark.jars=HAIL_DIR/backend/hail-all-spark.jar\n'
                '    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar\n'
                '    spark.executor.extraClassPath=./hail-all-spark.jar')
    else:
        pyspark.SparkContext._ensure_initialized()

    self._gateway = pyspark.SparkContext._gateway
    self._jvm = pyspark.SparkContext._jvm

    hail_package = getattr(self._jvm, 'is').hail

    self._hail_package = hail_package
    self._utils_package_object = scala_package_object(hail_package.utils)

    jsc = sc._jsc.sc() if sc else None

    if idempotent:
        self._jbackend = hail_package.backend.spark.SparkBackend.getOrCreate(
            jsc, app_name, master, local, True, min_block_size, tmpdir, local_tmpdir)
        self._jhc = hail_package.HailContext.getOrCreate(
            self._jbackend, log, True, append, branching_factor,
            skip_logging_configuration, optimizer_iterations)
    else:
        self._jbackend = hail_package.backend.spark.SparkBackend.apply(
            jsc, app_name, master, local, True, min_block_size, tmpdir, local_tmpdir)
        self._jhc = hail_package.HailContext.apply(
            self._jbackend, log, True, append, branching_factor,
            skip_logging_configuration, optimizer_iterations)

    self._jsc = self._jbackend.sc()
    if sc:
        self.sc = sc
    else:
        self.sc = pyspark.SparkContext(
            gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
    self._jspark_session = self._jbackend.sparkSession()
    self._spark_session = pyspark.sql.SparkSession(self.sc, self._jspark_session)

    # This has to go after creating the SparkSession. Unclear why.
    # Maybe it does its own patch?
    install_exception_handler()

    from hail.context import version

    py_version = version()
    jar_version = self._jhc.version()
    if jar_version != py_version:
        raise RuntimeError(f"Hail version mismatch between JAR and Python library\n"
                           f"  JAR:    {jar_version}\n"
                           f"  Python: {py_version}")

    self._fs = None
    self._logger = None

    if not quiet:
        sys.stderr.write('Running on Apache Spark version {}\n'.format(self.sc.version))
        if self._jsc.uiWebUrl().isDefined():
            sys.stderr.write('SparkUI available at {}\n'.format(self._jsc.uiWebUrl().get()))

    connect_logger(self._utils_package_object, 'localhost', 12888)

    self._jbackend.startProgressBar()
import findspark
import pyspark
import requests
from pyspark.sql import SQLContext, HiveContext
from pyspark.sql import functions as fn
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline
from pyspark.ml.feature import IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

conf = pyspark.SparkConf(). \
    setAppName('hva-data-scientist'). \
    setMaster('local[*]')
sc = pyspark.SparkContext(conf=conf)
sqlContext = HiveContext(sc)

findspark.init()


class Spark:
    def __init__(self):
        # Load the reviews CSV into a PySpark dataframe.
        df = sqlContext.read.format("csv").option("header", "true").load("hotel-reviews.csv")
        # df = sqlContext.createDataFrame(pandas_df)
        # Change Reviewer_Score into a Sentiment value (1 <= 5.5, 0 < 5.5)
            'longitude': station['longitude']
        }
        name = name[:-1]
    elif station['name'] in ['前海湾', '后海', '大剧院', '购物公园', '深圳北']:
        stations[name] = {
            'latitude': station['latitude'],
            'longitude': station['longitude']
        }
        name = name + '站'
    stations[name] = {
        'latitude': station['latitude'],
        'longitude': station['longitude']
    }

import pyspark

confi = pyspark.SparkConf()
confi.set('spark.network.timeout', '240s')
confi.set('spark.executor.memory', '1500m')
confi.set('spark.driver.maxResultSize', '1500m')
confi.set('spark.daemon.java.opts', '-Xmx=10000m')
confi.set('spark.daemon.memory', '10g')
confi.setMaster('local[20]')
sc = pyspark.SparkContext(conf=confi)


def map_function(i, source_id, des_id):
    mon = '09'
    days = ['05', '06', '09', '26', '28', '30']
    if not os.path.exists('/home/XXX/Python_Output/trajectories/new_agent/' + str(i) + '/'):
        os.makedirs('/home/XXX/Python_Output/trajectories/new_agent/' +
import pyspark as ps

conf = ps.SparkConf().setMaster("local").setAppName("parallelProcessing")
sc = ps.SparkContext(conf=conf)

data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)      # Create a distributed collection, Create a RDD (1 partition)
distDataP = sc.parallelize(data, 3)  # Slice the dataset into 3 partitions, 3 way parallelism

print(distDataP.count())
print(distDataP.getNumPartitions())
print(distDataP.reduce(lambda x, y: x + y))
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import pyspark
import pyspark.sql
import os


def doIt():
    print("---------RESULT-----------")
    myRdd = spark.sparkContext\
        .textFile(os.path.join("data", "ds_spark_wiki.txt"))
    res = myRdd\
        .flatMap(lambda x: x.split())\
        .map(lambda x: (x, 1))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda x: (x[1], x[0]))\
        .sortByKey(False)\
        .take(10)
    for i in res:
        print(i)


if __name__ == "__main__":
    myConf = pyspark.SparkConf()
    spark = pyspark.sql.SparkSession.builder\
        .master("local")\
        .appName("myApp")\
        .config(conf=myConf)\
        .getOrCreate()
    doIt()
    spark.stop()
    emotion = ""
    loudness = abs(an.loudness(inputdata))
    filename = filename.split("/")[-1].split(".")[0]
    if filename[0] == "s":
        emotion = filename[0:2]
        emotion = ord(emotion[0]) + ord(emotion[1])
    else:
        emotion = filename[0]
        emotion = float(ord(emotion)) / 100
    return [float(loudness), float(pitch), emotion]


working_directory = os.getcwd()
working_directory = working_directory + "/"

configuration = py.SparkConf()                   # setting the Spark configuration
sContext = py.SparkContext(conf=configuration)   # setting the Spark context
sContext.defaultParallelism

print("Data preprocessing start time:", datetime.datetime.now().time())
testdata = sContext.parallelize(
    gb.glob(
        "/media/vyassu/OS/Users/vyas/Documents/Assigments/BigData/AudioData/DC/*"
    )).map(dataconverter)
data = testdata.map(getData)
print("Data preprocessing end time:", datetime.datetime.now().time())
print(data.take(10))

# data1 = sContext.textFile(working_directory+"Test-TrainingData_SVM.csv")
# # print testdata.count()
#
# parsedData = data1.map(parsePoint)