def run_driver(keyspace, table, cass_host):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", cass_host)
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print(pixels.first())

    # Count unique visitors. Note that the data returned by Cassandra is
    # dict-like: partition keys, clustering keys and regular columns are all
    # accessible by name, and CQL collections (lists, sets and maps) are
    # converted to the corresponding Python data types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],)) \
        .distinct()
    print("Visitors: {:,}".format(visitors.count()))

    # Insert some new pixels into the table
    pixels = (
        {
            "customer_id": "example.com",
            "url": "http://example.com/article1/",
            "hour": dt.datetime(2014, 1, 2, 1),
            "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
            "pixel_id": str(uuid4()),
            "data": {"visitor_id": "xyz"},
        },
    )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print("Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table))

def get_spark():
    conf = SparkConf()

    # Load in a jar that provides extended string comparison functions such as
    # Jaro-Winkler (used by Splink).
    # "spark.driver.extraClassPath" is no longer needed in Spark 3.0+:
    # conf.set("spark.driver.extraClassPath", "jars/scala-udf-similarity-0.0.7.jar")
    conf.set("spark.jars", "jars/scala-udf-similarity-0.0.7.jar")
    conf.set("spark.jars.packages", "graphframes:graphframes:0.8.0-spark3.0-s_2.12")

    # WARNING:
    # These config options are appropriate only if you're running Spark locally!
    conf.set("spark.driver.memory", "4g")
    conf.set("spark.sql.shuffle.partitions", "8")

    sc = SparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("temp_graphframes/")
    spark = SparkSession(sc)

    # Register UDFs
    from pyspark.sql import types

    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        types.DoubleType(),
    )
    spark.udf.registerJavaFunction(
        "Dmetaphone",
        "uk.gov.moj.dash.linkage.DoubleMetaphone",
        types.StringType(),
    )
    return spark

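# A minimal usage sketch (not from the original source) showing how the UDFs
# registered above can be called from Spark SQL. The column names and sample
# rows below are made-up placeholders, and this assumes the similarity jar is
# actually available on the classpath configured in get_spark().
spark = get_spark()
pairs = spark.createDataFrame(
    [("john", "jon"), ("smith", "smyth")],
    ["name_l", "name_r"],
)
pairs.createOrReplaceTempView("name_pairs")
spark.sql(
    "SELECT name_l, name_r, "
    "       jaro_winkler_sim(name_l, name_r) AS sim, "
    "       Dmetaphone(name_l) AS dm_l "
    "FROM name_pairs"
).show()
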
def run_driver(keyspace, table):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print(pixels.first())

    # Count unique visitors. Note that the data returned by Cassandra is
    # dict-like: partition keys, clustering keys and regular columns are all
    # accessible by name, and CQL collections (lists, sets and maps) are
    # converted to the corresponding Python data types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],)) \
        .distinct()
    print("Visitors: {:,}".format(visitors.count()))

    # Insert some new pixels into the table
    pixels = (
        {
            "customer_id": "example.com",
            "url": "http://example.com/article1/",
            "hour": dt.datetime(2014, 1, 2, 1),
            "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
            "pixel_id": str(uuid4()),
            "data": {"visitor_id": "xyz"},
        },
    )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print("Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table))

def create_spark_session(app_name="SparkApplication_{}".format(
        datetime.utcfromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))):

    def init_calc_props(spark_ctx_conf):
        # Derive parallelism settings from the executor-core configuration when
        # both custom properties are present in the conf.
        if {'parallelism-to-tasks-factor',
                'total-executor-cores'} <= set(dict(spark_ctx_conf.getAll())):
            total_executor_cores = spark_utils.calc_max_cores(
                int(spark_ctx_conf.get('total-executor-cores')))
            parallelism_to_tasks_factor = int(
                spark_ctx_conf.get('parallelism-to-tasks-factor'))
            value = str(total_executor_cores * parallelism_to_tasks_factor)
            logging.info(
                "total_executor_cores: {0}, parallelism_to_tasks_factor: {1}, value: {2}"
                .format(total_executor_cores, parallelism_to_tasks_factor, value))
            spark_ctx_conf.set('spark.default.parallelism', value)
            spark_ctx_conf.set('spark.sql.shuffle.partitions', value)
        logging.debug('Starting local spark with conf: {0}'.format(
            "\n".join(str(v) for v in spark_ctx_conf.getAll())))

    from pyspark.sql import SparkSession
    from pyspark.context import SparkConf

    spark_conf = SparkConf()
    spark_conf.setAll(spark_conf_dict.items())
    init_calc_props(spark_conf)
    sc = spark_utils.generate_spark_context(spark_conf=spark_conf)
    spark = SparkSession(sc)
    spark.sparkContext.setLogLevel("WARN")
    return spark

def initialize():
    global sc, spark, items, inputfile, buckets_user, buckets_business, partition, totalSize, t, mainThreshold

    print("Initializing...")
    t = time.time()
    candidateList = []
    frequentList = []

    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: line != columnName)

    buckets_user = items.groupByKey().mapValues(list).filter(
        lambda x: len(x[1]) > mainThreshold).mapPartitionsWithIndex(
            removeDuplicateEntriesAfter)
    print("Without Duplicates Done..")
    # withoutDuplicates = checkM.mapPartitionsWithIndex(
    #     removeDuplicateEntries).groupByKey().mapValues(list)

    if case == 1:
        # buckets_user = withoutDuplicates.mapPartitionsWithIndex(
        #     createBuckets).groupByKey().mapValues(list).filter(lambda x: len(x[1]) > mainThreshold)
        callSonPhase1(buckets_user)
        print("Initializing Phase 2.....")
        finalFreq = buckets_user.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))
        # print(finalFreq.collect())
        finalOutput = finalFreq.collect()
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)

    if case == 2:
        # withoutDuplicates comes from the commented-out pipeline above
        buckets_business = withoutDuplicates.mapPartitionsWithIndex(
            createBuckets_case2).groupByKey().mapValues(list)
        callSonPhase1(buckets_business)
        print("Initializing Phase 2.....")
        finalFreq = buckets_business.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))
        # print(finalFreq.collect())
        finalOutput = finalFreq.collect()
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)

def __init__(self, spark_home, spark_master="local", exec_memory="8g",
             app_name="SparkClient"):
    """
    Initialize the SparkContext and SQLContext.

    :param spark_home: path to the Spark installation
    :param spark_master: target Spark master
    :param exec_memory: size of memory per executor
    :param app_name: Spark application name
    """
    self._spark_master = spark_master
    self._exec_memory = exec_memory
    self._app_name = app_name
    self._spark_home = spark_home

    # Path for the Spark source folder
    os.environ['SPARK_HOME'] = self._spark_home

    self._spark_url = spark_master
    if spark_master != "local":
        os.environ['SPARK_MASTER_IP'] = spark_master
        self._spark_url = "spark://" + self._spark_master + ":7077"

    # Append pyspark to the Python path
    sys.path.append(self._spark_home)

    # Define the Spark configuration
    conf = (SparkConf().setMaster(self._spark_url)
            .setAppName(self._app_name)
            .set("spark.executor.memory", self._exec_memory)
            .set("spark.core.connection.ack.wait.timeout", "600")
            .set("spark.akka.frameSize", "512")
            .set("spark.cassandra.output.batch.size.bytes", "131072"))

    # Create the Spark context (only if none is active yet)
    self._spark_ctx = None
    if SparkContext._active_spark_context is None:
        self._spark_ctx = SparkContext(conf=conf)

    # Create the SQL context
    self._sql = SQLContext(self._spark_ctx)

def setUp(self):
    class_name = self.__class__.__name__
    conf = SparkConf().set("spark.default.parallelism", 1)
    self.sc = SparkContext(appName=class_name, conf=conf)
    self.sc.setCheckpointDir("/tmp")
    # TODO: decrease duration to speed up tests
    self.ssc = StreamingContext(self.sc, self.duration)

def get_spark():
    conf = SparkConf()

    # Load in jars that provide extended string comparison functions such as
    # Jaro-Winkler (used by Splink).
    # "spark.driver.extraClassPath" is no longer needed in Spark 3.0+:
    # conf.set("spark.driver.extraClassPath", "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-similarity-0.0.7.jar")
    # conf.set("spark.driver.extraClassPath", "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-JaroWinkler-0.0.1.jar")

    # SET TO YOUR SPARK INSTALLATION. "spark.jars" takes a comma-separated
    # list, so both jars go into a single setting (a second conf.set() call
    # would overwrite the first).
    conf.set(
        "spark.jars",
        "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-similarity-0.0.7.jar,"
        "C:\\Spark\\spark-3.1.1-bin-hadoop2.7\\jars\\scala-udf-JaroWinkler-0.0.1.jar",
    )

    # WARNING:
    # These config options are appropriate only if you're running Spark locally!
    conf.set("spark.driver.memory", "1g")
    conf.set("spark.sql.shuffle.partitions", "4")
    # conf.set("spark.sql.files.maxPartitionBytes", "536870912")
    # conf.set("spark.sql.files.maxPartitionBytes", "250000000")
    # conf.set("spark.sql.files.maxPartitionBytes", "134217728")

    sc = SparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("temp_graphframes/")
    spark = SparkSession(sc)

    # Register UDFs
    from pyspark.sql import types
    '''
    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        types.DoubleType(),
    )
    spark.udf.registerJavaFunction(
        "Dmetaphone", "uk.gov.moj.dash.linkage.DoubleMetaphone", types.StringType()
    )
    '''
    return spark

def main():
    conf = SparkConf().setAppName('Home run count').setMaster('local')
    sc = SparkContext(conf=conf)
    spark = SparkSession.builder.appName("pyspark sql").getOrCreate()

    # batting_path = "hdfs://localhost:8020/user/baseball/Batting.csv"  # alternative HDFS location
    batting_path = "hdfs://localhost:9000/data/Batting.csv"
    data = spark.read.csv(batting_path, inferSchema=True, header=True)
    df = data.filter(data.yearID == '2018').select('playerID', 'teamID')

    # filter players that play on two or more teams
    players_vertices = df.groupBy("playerID").count().filter(
        "count > 1").select("playerID")

    edges = df.withColumnRenamed("playerID", "src")
    edges = edges.withColumnRenamed("teamID", "dst")
    edges = players_vertices.join(
        edges, players_vertices.playerID == edges.src,
        "inner").select("src", "dst")

    players_vertices = players_vertices.withColumnRenamed("playerID", "id")
    teams_vertices = edges.select("dst").distinct().withColumnRenamed("dst", "id")
    vertices = players_vertices.union(teams_vertices)
    # add one column with an auto-increasing id
    vertices = vertices.withColumn('num', monotonically_increasing_id())

    graph = GraphFrame(vertices, edges)

    # motif 1
    motif = graph.find("(a)-[]->(b); (a)-[]->(c)").filter("c.num > b.num")
    calculate(motif)

    # motif 2
    motif = graph.find(
        "(a)-[]->(b); (a)-[]->(c);(d)-[]->(b); (d)-[]->(c)").filter(
        "c.num > b.num and d.num > a.num")
    calculate(motif)

    # motif 3
    motif = graph.find(
        "(a)-[]->(b); (a)-[]->(c);(d)-[]->(b); (d)-[]->(c);(e)-[]->(b); (e)-[]->(c)"
    ).filter("c.num > b.num and d.num > a.num and e.num > d.num").distinct()
    calculate(motif)

    # motif 4
    motif = graph.find(
        "(a)-[]->(b); (a)-[]->(c);(d)-[]->(b); (d)-[]->(c);(e)-[]->(b); (e)-[]->(c);(f)-[]->(b);(f)-[]->(c)"
    ).filter(
        "c.num > b.num and d.num > a.num and e.num > d.num and f.num > e.num"
    ).distinct()
    calculate(motif)

    output_path = "/user/Wang/graphframe"

    # format the output
    final_result = []
    for key in result.keys():
        line = ""
        key_split = key.split("_")
        for i in range(len(key_split)):
            line += " " + key_split[i]
        for team in result[key]:
            line += " " + team
        final_result.append(line)

    data = sc.parallelize(final_result)
    data.saveAsTextFile(output_path)

def setup():
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 0.5)
    dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
    wc = dstream.updateStateByKey(updater)
    wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
    wc.checkpoint(.5)
    return ssc

def initialize():
    global sc, spark, items, inputfile

    print("Initializing...")
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    jsonread = sc.textFile(inputfile)
    items = jsonread.map(json.loads)

def setup():
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 0.5)

    # A function that cannot be serialized
    def process(time, rdd):
        sc.parallelize(range(1, 10))

    ssc.textFileStream(inputd).foreachRDD(process)
    return ssc

def get_spark_context(workers="*", driver_memory=None, executor_memory=None):
    """
    This function sets up a local Spark context, configured for use with
    SQL Server and AWS S3.
    """
    # we need some libraries (jars) to connect to SQL Server and S3,
    # so define this config
    jar_dir = r"C:\Jars"
    files = os.listdir(jar_dir)
    jars = [f for f in files if f.lower().endswith(".jar")]
    extra_class_path = ";".join([os.path.join(jar_dir, j) for j in jars])

    # setup spark context
    conf = SparkConf().setMaster(f"local[{workers}]") \
        .set("spark.driver.extraClassPath", extra_class_path) \
        .set("spark.executor.heartbeatInterval", "60s")
    if driver_memory:
        conf.set("spark.driver.memory", driver_memory)
    if executor_memory:
        conf.set("spark.executor.memory", executor_memory)
    spark_context = SparkContext(conf=conf)

    # we need to configure our s3 endpoint because our buckets are in London
    spark_context.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
    spark_context._jsc.hadoopConfiguration().set(
        "fs.s3a.endpoint", "s3.eu-west-2.amazonaws.com")

    return spark_context

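# A minimal usage sketch (not from the original source): read a CSV from S3
# through the context returned by get_spark_context(). The bucket, key and
# credential values below are hypothetical placeholders, and this assumes the
# hadoop-aws / AWS SDK jars are among those loaded from C:\Jars.
from pyspark.sql import SparkSession

sc = get_spark_context(workers="4", driver_memory="4g")
spark = SparkSession(sc)

# Credentials would normally come from an AWS credentials provider; they are
# shown here only as placeholders.
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", "<ACCESS_KEY>")
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", "<SECRET_KEY>")

df = spark.read.csv("s3a://example-bucket/some/prefix/data.csv",
                    header=True, inferSchema=True)
df.show(5)
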
def initialize(target_partitions=None):
    """Returns SparkContext and SQLContext."""
    conf = SparkConf()
    extra_settings = {
        'spark.serializer': 'org.apache.spark.serializer.KryoSerializer',
        'spark.executor.extraJavaOptions': '-XX:+UseG1GC'
    }
    if target_partitions:
        extra_settings['spark.default.parallelism'] = target_partitions
    conf.setAll(extra_settings.items())

    environment = {'PYTHON_EGG_CACHE': '/tmp/python-eggs'}
    sc = SparkContext(conf=conf, environment=environment)

    sqlContext = SQLContext(sc)
    if target_partitions:
        sqlContext.setConf('spark.sql.shuffle.partitions', target_partitions)

    # Quiet the JVM-side log4j loggers
    jvm_logger = sc._jvm.org.apache.log4j
    jvm_logger.LogManager.getLogger("org").setLevel(jvm_logger.Level.ERROR)
    jvm_logger.LogManager.getLogger("akka").setLevel(jvm_logger.Level.ERROR)

    return sc, sqlContext

def spark_session_setup(request):
    """
    Fixture for creating a Spark session.

    Args:
        request: pytest.FixtureRequest object
    """
    conf = (
        SparkConf().setMaster("local[*]").setAppName("aida_insights_testing"))
    spark_context = SparkContext(conf=conf)
    sc = SparkSession(sparkContext=spark_context).builder.getOrCreate()
    request.addfinalizer(lambda: sc.stop())

    quiet_py4j()
    return sc

def initialize():
    global items, inputfile, sc, filterThreshold, t, totalEdges, cost_dict, strict_totalNodes, adjacency_listMain

    t = time.time()
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    # sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    # print(columnName)
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: line != columnName)

    # Getting each user and their set of businesses
    user_business = items.groupByKey().mapValues(set).collect()

    # Build the edge list: two users are connected (in both directions) if
    # they share at least filterThreshold businesses
    tuple_edge_list = []
    for i in range(0, len(user_business) - 1):
        for j in range(i + 1, len(user_business)):
            inter = user_business[i][1] & user_business[j][1]
            if len(inter) >= filterThreshold:
                tuple_edge_list.append(
                    (str(user_business[i][0]), str(user_business[j][0])))
                tuple_edge_list.append(
                    (str(user_business[j][0]), str(user_business[i][0])))

    totalEdges = float(len(tuple_edge_list) / 2)
    adjacency_list = sc.parallelize(tuple_edge_list).groupByKey().mapValues(
        list).collectAsMap()
    adjacency_listMain = copy.deepcopy(adjacency_list)
    totalNodes = list(adjacency_list.keys())
    # ------------------------Newly added line------------------------
    strict_totalNodes = copy.deepcopy(totalNodes)
    # print(len(totalNodes))

    # ----------------------Part 1---------------------
    bfs(totalNodes, adjacency_list)
    print("Writing Betweenness to File....")

    # Convert the initial betweenness values into a sorted list
    list_val = list(cost_dict.items())
    list_val.sort(key=lambda x: (-x[1], x[0]))
    writeToFile(list_val)

    totalNodes = copy.deepcopy(strict_totalNodes)
    # print(len(totalNodes))

    # ----------------------Part 2----------------------
    print("Creating Partitions....")
    create_components(list_val, adjacency_listMain, totalNodes, totalEdges)
    # ---------------------EoC--------------------------
    print("Duration: " + str(time.time() - t))

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/"
DATA_BUCKET_FOLDER = "/outbrain/orig/"
SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"

from pyspark.sql.types import *
import pyspark.sql.functions as F

from pyspark.context import SparkContext, SparkConf
from pyspark.sql.session import SparkSession

conf = SparkConf().setMaster('local[*]').set(
    'spark.executor.memory', '256g').set(
    'spark.driver.memory', '126g').set(
    "spark.local.dir", SPARK_TEMP_FOLDER)

sc = SparkContext(conf=conf)
spark = SparkSession(sc)

print('Loading data...')

truncate_day_from_timestamp_udf = F.udf(
    lambda ts: int(ts / 1000 / 60 / 60 / 24), IntegerType())

events_schema = StructType([
    StructField("display_id", IntegerType(), True),
    StructField("uuid_event", StringType(), True),
    StructField("document_id_event", IntegerType(), True),
    StructField("timestamp_event", IntegerType(), True),
from __future__ import print_function

import sys
from operator import add

from pyspark.sql import SparkSession
from pyspark.context import SparkContext, SparkConf

if __name__ == "__main__":
    config = SparkConf().setAppName("wordCount").setMaster("local")
    sc = SparkContext(conf=config)

    lines = sc.textFile("./src/main/python/wordCount/hello.txt")
    words = lines.flatMap(lambda line: line.split(" "))
    wordCountMap = words.map(lambda word: (word, 1))

    # count = wordCountMap.reduceByKey(lambda preCount, count: preCount + count)
    # output = count.collect()
    # print(output)

    # use countByKey instead
    count = wordCountMap.countByKey()
    print(count)

#
# This configuration works for Spark on an HDP cluster in yarn-client mode
#
import os, sys

# set OS environment variable
os.environ["SPARK_HOME"] = '/usr/hdp/2.4.2.0-258/spark'
# add Spark library to Python
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], 'python'))

# import package
import pyspark
from pyspark.context import SparkContext, SparkConf
import atexit


def stop_my_spark():
    # sc is the module-level SparkContext created below
    global sc
    sc.stop()
    del sc


# Register exit
atexit.register(stop_my_spark)

# Configure and start Spark ... but only once.
if 'sc' not in globals():
    conf = SparkConf()
    conf.setAppName('MyFirstSpark')  ## you may want to change this
    conf.setMaster('yarn-client')
    sc = SparkContext(conf=conf)
    print("Launched Spark version %s with ID %s" % (sc.version, sc.applicationId))
    print("http://arc.insight.gsu.edu:8088/cluster/app/%s" % (sc.applicationId))

from pyspark.context import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.sql.functions import *

if __name__ == "__main__":
    conf = SparkConf().setAppName("flight tripreports")
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    flights_data = sqlContext.sql("select * from refined_airlines.flights")
    non_cancelled_flights = flights_data.filter(flights_data.cancelled == 0)
    non_cancelled_longer_flights = non_cancelled_flights.filter(
        non_cancelled_flights.distance >= 1000)

    grouped_data_count = non_cancelled_longer_flights.groupBy(
        "flight_number").count()
    grouped_distance_sum = non_cancelled_longer_flights.groupBy(
        "flight_number").sum("distance")

    df1 = grouped_data_count.alias('df1')
    df2 = grouped_distance_sum.alias('df2')
    trip_report = df1.join(
        df2, df1.flight_number == df2.flight_number,
        how='inner').select('df1.flight_number',
                            col('df1.count').alias("total_trips"),
                            col('df2.sum(distance)').alias("total_distance"))
    trip_report.createOrReplaceTempView("trip_report_temp")
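    # A possible follow-up (not part of the original snippet): once the temp
    # view exists it can be queried with SQL; the ordering and limit below are
    # only an illustrative example.
    top_trips = sqlContext.sql(
        "select flight_number, total_trips, total_distance "
        "from trip_report_temp order by total_distance desc limit 10")
    top_trips.show()
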
#
# This configuration works for Spark on macOS using homebrew
#
import os, sys

# set OS environment variable
os.environ["SPARK_HOME"] = '/usr/local/Cellar/apache-spark/2.2.0/libexec'
# add Spark library to Python
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], 'python'))

# import package
import pyspark
from pyspark.context import SparkContext, SparkConf
import atexit


def stop_my_spark():
    # sc is the module-level SparkContext created below
    global sc
    sc.stop()
    del sc


# Register exit
atexit.register(stop_my_spark)

# Configure and start Spark ... but only once.
if 'sc' not in globals():
    conf = SparkConf()
    conf.setAppName('MyFirstSpark')  ## you may want to change this
    conf.setMaster('local[2]')
    sc = SparkContext(conf=conf)
    print("Launched Spark version %s with ID %s" % (sc.version, sc.applicationId))

import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext  # , HiveContext
# from pyspark.storagelevel import StorageLevel
import atexit

from pyspark_cassandra import CassandraSparkContext

from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()
# conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf=conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testTogether")

# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#   SELECT url,date,site,cnts,cnt from cockpit2_allTogether
#   where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with new value for cnts:
#
#   UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes
#                 + fb_socialFacebookShares + fb_socialFacebookComments
#                 + tw_socialTwitterShares + ga_socialGooglePlusShares
#                 + gigya_socialComments


def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
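# A hedged sketch (not part of the original snippet) of the update described in
# the comments above: recompute "cnts" as the sum of the engagement counters
# and write the rows back. Column and key names are taken from the comment or
# assumed, and the exact row type returned by cassandraTable() depends on the
# pyspark_cassandra version, so treat this as an outline rather than a tested
# implementation.
COUNTER_COLUMNS = [
    "ga_videoPlays", "sda_downloads", "fb_socialFacebookLikes",
    "fb_socialFacebookShares", "fb_socialFacebookComments",
    "tw_socialTwitterShares", "ga_socialGooglePlusShares",
    "gigya_socialComments",
]


def recompute_cnts(row):
    # Emit a plain dict with the (assumed) key columns plus the new total;
    # saving it back to Cassandra acts as an upsert that only touches "cnts".
    return {
        "url": row["url"],
        "date": row["date"],
        "site": row["site"],
        "cnts": sum((row[c] or 0) for c in COUNTER_COLUMNS),
    }


# updated = (rdd
#            .filter(lambda r: r["site"] == "giga"
#                    and "resort:android" in (r["tags"] or []))
#            .map(recompute_cnts))
# updated.saveToCassandra("el_test", "cockpit2_testTogether")
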
'''
@Author: Matheus Barros
Date: 23/04/2021

'''

from pyspark.context import SparkContext, SparkConf
from pyspark.sql.context import SQLContext
from pyspark.sql.session import SparkSession

# PARALLELIZING WITH 2 CORES
conf = SparkConf().setAppName("rdd basic").setMaster("local[2]")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

# transformation combines values with the same key
regularRDD = sc.parallelize([("Messi", 23), ("Ronaldo", 34),
                             ("Neymar", 22), ("Messi", 24)])
pairRDD_reducebykey = regularRDD.reduceByKey(lambda x, y: x + y)
reducebykey = pairRDD_reducebykey.collect()
print(reducebykey)

# operation orders pair RDD by key
pairRDD_reducebykey_rev = pairRDD_reducebykey.map(lambda x: (x[1], x[0]))
pairRDD_reducebykey_rev = pairRDD_reducebykey_rev.sortByKey(
    ascending=True).collect()

#
import os, sys

# set OS environment variable
os.environ["SPARK_HOME"] = '/usr/local/Cellar/apache-spark/2.2.0/libexec'
# add Spark library to Python
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], 'python'))

# import package
import pyspark
from pyspark.context import SparkContext, SparkConf
import atexit


def stop_my_spark():
    # sc is the module-level SparkContext created below
    global sc
    sc.stop()
    del sc


# Register exit
atexit.register(stop_my_spark)

# Configure and start Spark ... but only once.
if 'sc' not in globals():
    conf = SparkConf()
    conf.setAppName('MyFirstSpark')  ## you may want to change this
    conf.setMaster('local[2]')
    sc = SparkContext(conf=conf)
    print("Launched Spark version %s with ID %s" % (sc.version, sc.applicationId))

# Retrieves table names from a database
def get_table_names(db):
    temp = sqlContext.sql("show tables from " + db).collect()
    table_names = []
    for i in range(len(temp)):
        table_names.append(temp[i][1])
    return table_names


if __name__ == "__main__":
    conf = SparkConf().setAppName('SCD-Implementation')
    sc = SparkContext(conf=conf).getOrCreate()
    sqlContext = HiveContext(sc)

    temp_table_names = get_table_names("temp_adventureworks")
    target_table_names = get_table_names("adventureworks")

    # If the temp table does not exist on target, it is created
    for i in temp_table_names:
        if i not in target_table_names:
            table = sqlContext.sql("select * from temp_adventureworks." + str(i))
            col_names = sqlContext.sql("describe temp_adventureworks." +
entry_point = gateway.entry_point
imports = entry_point.getPy4JImports()
for i in imports:
    java_import(gateway.jvm, i)

context_config = \
    ConfigFactory.parse_string(entry_point.contextConfigAsHocon())
job_id = entry_point.jobId()
job_env = JobEnvironment(job_id, None, context_config)
job_config = ConfigFactory.parse_string(entry_point.jobConfigAsHocon())
job_class = import_class(entry_point.jobClass())
job = job_class()

jcontext = entry_point.context()
jspark_conf = entry_point.sparkConf()
spark_conf = SparkConf(_jconf=jspark_conf)

context_class = jcontext.contextType()
context = None
if context_class == 'org.apache.spark.api.java.JavaSparkContext':
    context = SparkContext(gateway=gateway, jsc=jcontext, conf=spark_conf)
elif context_class == 'org.apache.spark.sql.SQLContext':
    jsc = gateway.jvm.org.apache.spark.api.java.JavaSparkContext(
        jcontext.sparkContext())
    sc = SparkContext(gateway=gateway, jsc=jsc, conf=spark_conf)
    ss = SparkSession(sc, jcontext.sparkSession())
    context = SQLContext(sc, ss, jcontext)
elif context_class == 'org.apache.spark.sql.hive.HiveContext':
    jsc = gateway.jvm.org.apache.spark.api.java.JavaSparkContext(
        jcontext.sparkContext())
    sc = SparkContext(gateway=gateway, jsc=jsc, conf=spark_conf)
    context = HiveContext(sc, jcontext)

from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import functions as f
from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions
from awsglue.transforms import *
from awsglue.context import GlueContext
from awsglue.job import Job
import sys

args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME'])

conf = SparkConf()
conf.set("spark.sql.parquet.compression.codec", "snappy")
conf.set("spark.sql.parquet.writeLegacyFormat", "true")

sc = SparkContext(conf=conf)
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

input_file_path = "s3://xxxxx"

    return html_pages_array


# java_path = "C:\Program Files\Java\jre1.8.0_191\bin\java.exe"
# os.environ['JAVAHOME'] = java_path

record_attribute = "WARC-Record-ID"

# Here we use a smaller test file due to computation time. Use the sample.warc.gz for real testing.
in_file = "C:/Users/klm85310/Documents/WDPS/sample.warc.gz"
stanford = 'C:/Users/klm85310/Documents/WDPS/stanford-ner-2017-06-09/stanford-ner-2017-06-09'

# Create Spark Context -- Remove this when running on cluster
# sc = SparkContext.getOrCreate()
conf = SparkConf().setAppName("Entity Recognition").setMaster("local[*]")
sc = SparkContext(
    conf=conf,
    serializer=PickleSerializer(),  # Default serializer
    # Unlimited batch size -> BatchedSerializer instead of AutoBatchedSerializer
    batchSize=64)

st = StanfordNERTagger(
    stanford + '/classifiers/english.all.3class.distsim.crf.ser.gz',
    stanford + '/stanford-ner.jar',
    encoding='utf-8')

rdd_whole_warc_file = rdd = sc.newAPIHadoopFile(
    in_file,
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    values = [s for s in line.split(',')]
    if values[0] == -1:  # Convert -1 labels to 0 for MLlib
        values[0] = 0
    # print values[0:-1]
    return LabeledPoint(input[values[-1]], values[0:-1])


if __name__ == '__main__':
    BASE_DATA_PATH = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '../..', '../../../data',
                     'kaggle'))
    print(BASE_DATA_PATH)

    conf = (SparkConf().setMaster("local[2]").setAppName("Trial DF Set"))
    sc = SparkContext(conf=conf)

    # read data as CSV for Dataframe analysis
    # /Volumes/work/data/kaggle/ssi.csv
    # read data normally
    '''
    sqlContext = SQLContext(sc)
    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(BASE_DATA_PATH + '/ssi.csv')
    # summarize(df)
    print df.show()

    #points = df.map(lambda row: LabeledPoint(input[row.C4],[float(row.C0),float(row.C1),float(row.C2),float(row.C3)]))
# /opt/spark/bin/pyspark --master local[1] --jars /opt/symetry/lib/sym-spark-assembly.jar --driver-java-options -Dsym.lic.loc=/opt/symetry/sym.lic
# execfile('/Users/mike/rtds/master/RTLM/ScalaProjects/sym-shell/src/com/sml/examples/python/amazonExample.py')

import os
import sys

import pyspark
from pyspark.context import SparkContext
from pyspark.context import SparkConf
from pyspark.sql import SQLContext, HiveContext
from pyspark.storagelevel import StorageLevel

print("amazonExample.py start")

conf = SparkConf()
conf.setAppName('amazonExample')
sc = SparkContext(conf=conf)
gateway = sc._gateway
sym = gateway.jvm.com.sml.shell

# Find the access keys for EC2.
awsAccessKeyId = os.environ['AWS_ACCESS_KEY']
awsSecretAccessKey = os.environ['AWS_SECRET_KEY']
# print("awsAccessKeyId=" + awsAccessKeyId)
# print("awsSecretAccessKey=" + awsSecretAccessKey)

sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", awsAccessKeyId)
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", awsSecretAccessKey)

myrdd = sc.textFile('s3a://sml-oregon/datasets/susy/SUSYmini.csv')

def setUpClass(cls):
    class_name = cls.__name__
    conf = SparkConf().set("spark.default.parallelism", 1)
    cls.sc = SparkContext(appName=class_name, conf=conf)
    cls.sc.setCheckpointDir(tempfile.mkdtemp())

import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext  # , HiveContext
# from pyspark.storagelevel import StorageLevel
import atexit

from pyspark_cassandra import CassandraSparkContext

from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()
# conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf=conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testIndexes")

# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#   SELECT url,date,site,cnts,cnt from cockpit2_allTogether
#   where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with new value for cnts:
#
#   UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes
#                 + fb_socialFacebookShares + fb_socialFacebookComments
#                 + tw_socialTwitterShares + ga_socialGooglePlusShares
#                 + gigya_socialComments


def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
def test_get_or_create_and_get_active_or_create(self):
    inputd = tempfile.mkdtemp()
    outputd = tempfile.mkdtemp() + "/"

    def updater(vs, s):
        return sum(vs, s or 0)

    def setup():
        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(conf=conf)
        ssc = StreamingContext(sc, 2)
        dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
        wc = dstream.updateStateByKey(updater)
        wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
        wc.checkpoint(2)
        self.setupCalled = True
        return ssc

    # Verify that getOrCreate() calls setup() in absence of checkpoint files
    self.cpd = tempfile.mkdtemp("test_streaming_cps")
    self.setupCalled = False
    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    self.assertTrue(self.setupCalled)

    self.ssc.start()

    def check_output(n):
        while not os.listdir(outputd):
            if self.ssc.awaitTerminationOrTimeout(0.5):
                raise Exception("ssc stopped")
        time.sleep(1)  # make sure mtime is larger than the previous one
        with open(os.path.join(inputd, str(n)), 'w') as f:
            f.writelines(["%d\n" % i for i in range(10)])

        while True:
            if self.ssc.awaitTerminationOrTimeout(0.5):
                raise Exception("ssc stopped")

            p = os.path.join(outputd, max(os.listdir(outputd)))
            if '_SUCCESS' not in os.listdir(p):
                # not finished
                continue
            ordd = self.ssc.sparkContext.textFile(p).map(
                lambda line: line.split(","))
            d = ordd.values().map(int).collect()
            if not d:
                continue
            self.assertEqual(10, len(d))
            s = set(d)
            self.assertEqual(1, len(s))
            m = s.pop()
            if n > m:
                continue
            self.assertEqual(n, m)
            break

    check_output(1)
    check_output(2)

    # Verify the getOrCreate() recovers from checkpoint files
    self.ssc.stop(True, True)
    time.sleep(1)
    self.setupCalled = False
    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.ssc.start()
    check_output(3)

    # Verify that getOrCreate() uses existing SparkContext
    self.ssc.stop(True, True)
    time.sleep(1)
    self.sc = SparkContext(conf=SparkConf())
    self.setupCalled = False
    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.assertTrue(self.ssc.sparkContext == self.sc)

    # Verify the getActiveOrCreate() recovers from checkpoint files
    self.ssc.stop(True, True)
    time.sleep(1)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.ssc.start()
    check_output(4)

    # Verify that getActiveOrCreate() returns active context
    self.setupCalled = False
    self.assertEqual(StreamingContext.getActiveOrCreate(self.cpd, setup),
                     self.ssc)
    self.assertFalse(self.setupCalled)

    # Verify that getActiveOrCreate() uses existing SparkContext
    self.ssc.stop(True, True)
    time.sleep(1)
    self.sc = SparkContext(conf=SparkConf())
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.assertTrue(self.ssc.sparkContext == self.sc)

    # Verify that getActiveOrCreate() calls setup() in absence of checkpoint files
    self.ssc.stop(True, True)
    shutil.rmtree(self.cpd)  # delete checkpoint directory
    time.sleep(1)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
    self.assertTrue(self.setupCalled)

    # Stop everything
    self.ssc.stop(True, True)

    return


# =======================================
if __name__ == "__main__":
    time1 = time.time()
    PASS = 20
    PASS2 = 20
    item_weights = {}

    conf = SparkConf().setAppName('inf553_hw3_2').setMaster('local[*]')
    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")

    train_file = sys.argv[1]
    val_file = sys.argv[2]
    case_mark = int(sys.argv[3])
    output_file = sys.argv[4]

    if case_mark == 1:
        model_based()
    elif case_mark == 2:
        user_based()
    elif case_mark == 3:
        item_based()
    else: