def checkSpark():
    if request.method == 'POST':
        try:
            findspark.find()
            return "Spark is properly installed"
        except Exception as e:
            return (str(e), 400)

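# Usage sketch (an assumption, not shown in the original snippet): the handler above uses
# Flask's `request`, so it needs app wiring roughly like the following; the route path and
# port are illustrative placeholders.
import findspark
from flask import Flask, request

app = Flask(__name__)
app.add_url_rule('/check-spark', view_func=checkSpark, methods=['POST'])

if __name__ == '__main__':
    app.run(port=5000)
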
def create_spark_session():
    """Create Spark session."""
    print("Create Spark session")
    findspark.init()
    findspark.find()
    conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
    sc = pyspark.SparkContext(conf=conf)
    spark = SparkSession(sc)
    return spark

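# Minimal usage sketch for the helper above (assumes findspark, pyspark and
# pyspark.sql.SparkSession are imported in the enclosing module; the sample rows and
# column names are illustrative only).
spark = create_spark_session()
sample_df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
sample_df.show()
spark.stop()
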
def test_make_class_model_saving(spark_df_class, pandas_factory_fixture):
    spark, df = spark_df_class
    path = findspark.find()
    path = os.path.join(path, 'models')
    test_path = os.path.join(path, 'test_class_model')
    test_path = test_path + '_' + str(int(time()))
    make_class_model(df, spark, test_path, "test_class_model", "test_target", save=True)
    cluster = Cluster(['127.0.0.1'], port=9042)
    session = cluster.connect("models")
    session.row_factory = pandas_factory_fixture
    session.default_fetch_size = None
    query_latest = "Select Max(timestamp) from models_statistics where model_name = 'test_class_model'"
    query_path = (
        "Select model_path from models_statistics where timestamp = %s and model_name = 'test_class_model'"
        " ALLOW FILTERING")
    latest_timestamp = session.execute(
        query_latest, timeout=None)._current_rows.iloc[0]['system.max(timestamp)']
    query_path = query_path % latest_timestamp
    path = session.execute(query_path, timeout=None)._current_rows.iloc[0]['model_path']
    expected_path = test_path
    actual_path = path
    assert actual_path == expected_path
    session.shutdown()
    cluster.shutdown()

def init_spark():
    print("Initializing Spark...")
    import findspark
    findspark.init()  # uses SPARK_HOME
    print("Spark found in : ", findspark.find())
    import pyspark
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    # Use a unique temp dir for the warehouse dir, so we can run multiple Spark sessions in one dir
    import tempfile
    tmpdir = tempfile.TemporaryDirectory()
    # print(tmpdir.name)

    config = (
        SparkConf().setAppName("TestApp").setMaster("local[*]")
        .set('spark.executor.memory', '2g')
        .set('spark.sql.warehouse.dir', tmpdir.name)
        .set("some_property", "some_value")  # another example
    )
    print("Spark config:\n\t", config.toDebugString().replace("\n", "\n\t"))
    spark = SparkSession.builder.config(conf=config).getOrCreate()
    print('Spark UI running on port ' + spark.sparkContext.uiWebUrl.split(':')[2])
    return spark

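# Hedged usage sketch for init_spark() above; the config lookup and the sample range
# are illustrative, not part of the original code.
spark = init_spark()
print(spark.conf.get("spark.sql.warehouse.dir"))  # the per-session temp warehouse dir set above
spark.range(5).show()
spark.stop()
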
def writeToCassandra(stream, checkpoint="checkpoint"):
    # Create the Spark-Cassandra connection and write the data stream to the Cassandra database
    checkpointLocation = os.path.join(findspark.find(), checkpoint)
    checkpointLocation = os.path.join(checkpointLocation, checkpoint)
    checkpointLocation = checkpointLocation + '_' + str(int(time()))
    query = stream.writeStream \
        .format("org.apache.spark.sql.cassandra") \
        .outputMode('append') \
        .options(table="predictions", keyspace="predictions",
                 checkpointLocation=checkpointLocation, failOnDataLoss="false") \
        .start()
        # .awaitTermination()
    return query

def test_make_regr_model_pipeline(spark_df_regr):
    spark, df = spark_df_regr
    path = findspark.find()
    path = os.path.join(path, 'models')
    test_path = os.path.join(path, 'test_regr_model')
    test_path = test_path + '_' + str(int(time()))
    model, _ = make_regr_model(df, spark, test_path, "test_regr_model", "test_target", save=False)
    actual_stages = str([type(x) for x in model.stages])
    expected_stages = "[<class 'pyspark.ml.feature.StringIndexerModel'>," \
                      " <class 'pyspark.ml.feature.StringIndexerModel'>," \
                      " <class 'pyspark.ml.feature.OneHotEncoderModel'>," \
                      " <class 'pyspark.ml.feature.VectorAssembler'>," \
                      " <class 'pyspark.ml.feature.VectorAssembler'>," \
                      " <class 'pyspark.ml.feature.StandardScalerModel'>," \
                      " <class 'pyspark.ml.feature.VectorAssembler'>," \
                      " <class 'pyspark.ml.regression.RandomForestRegressionModel'>]"
    assert actual_stages == expected_stages

def test_writeToCassandra(predictions, pandas_factory_fixture):
    predictions, spark, timestamp = predictions
    json_path = os.path.join(findspark.find(), "temp_json_test")
    json_file_path = os.path.join(json_path, "test" + str(int(time())))
    predictions.coalesce(1).write.format('json').save(json_file_path)
    schema = StructType() \
        .add("prediction", StringType()) \
        .add("model_path", StringType()) \
        .add("source_name", StringType()) \
        .add("individual", StringType()) \
        .add("target_column", StringType()) \
        .add("timestamp", IntegerType())
    streamingDF = (spark.readStream.schema(schema).option(
        "maxFilesPerTrigger", 6).json(json_file_path))
    connection = writeToCassandra(
        streamingDF,
        checkpoint=os.path.join("test_checkpoint", "test" + str(int(time()))))
    connection.awaitTermination(5)
    cluster = Cluster(['127.0.0.1'], port=9042)
    session = cluster.connect("predictions")
    session.row_factory = pandas_factory_fixture
    session.default_fetch_size = None
    query_count = "select count(*) from predictions.predictions where target_column = 'test_target'" \
                  " and timestamp = %s ALLOW FILTERING;"
    query_count = query_count % timestamp
    count = session.execute(query_count, timeout=None)._current_rows.iloc[0]['count']
    session.shutdown()
    cluster.shutdown()
    assert streamingDF.isStreaming is True
    assert count > 0

def load_and_train(source):
    plt = platform.system()
    if plt == "Linux":
        findspark.init("/home/smartcity/Downloads/spark-3.0.1-bin-hadoop2.7")
    path = findspark.find()
    path = os.path.join(path, 'models')
    if source == "powietrze":
        # Load the "powietrze" (air quality) table from the master dataset database
        data_pow, sc_pow = data_for_ml.powietrze_manipulation.load_powietrze()
        # Train a classification model on the previously loaded data
        powietrze_path = os.path.join(path, 'powietrze_model')
        powietrze_path = powietrze_path + '_' + str(int(time()))
        spark_ml.classificator.Classification.make_class_model(
            data_pow, sc_pow, powietrze_path, 'RF_pow', 'pm25')
    elif source == "urzedy":
        data_urz, sc_urz = data_for_ml.urzedy_manipulation.load_urzedy(
            agg="moving_average")
        urzedy_path = os.path.join(path, 'urzedy_model')
        urzedy_path = urzedy_path + '_' + str(int(time()))
        spark_ml.reggresor.Regression.make_regr_model(
            data_urz, sc_urz, urzedy_path, 'RF_urz_mav', "liczbaKlwKolejce")
    elif source == "velib":
        data_vel, sc_vel = data_for_ml.velib_manipulation.load_velib()
        velib_path = os.path.join(path, 'velib_model')
        velib_path = velib_path + '_' + str(int(time()))
        spark_ml.reggresor.Regression.make_regr_model(
            data_vel, sc_vel, velib_path, 'RF_vel', 'numbikesavailable')

#!/usr/bin/env python
# coding: utf-8

# Import the libraries required to locate the pyspark installation from a Jupyter notebook
import findspark
findspark.init()
findspark.find()

import pyspark

# Import the pyspark functions that will help us explode the list of check-in dates
# for each business into individual rows
from pyspark.sql.functions import col, split
from pyspark.sql.functions import explode

# Import SparkSession from pyspark
from pyspark.sql import SparkSession

import json
import csv
import re
import string
import pandas as pd

# Create the Spark session with the necessary configuration
spark = SparkSession.builder.appName('EDA').master('local').enableHiveSupport().getOrCreate()

# Read the input files (stored on HDFS in JSON format) and create Spark dataframes on top of them
review = spark.read.json('hdfs://0.0.0.0:19000/yelp/review.json')
business = spark.read.json('hdfs://0.0.0.0:19000/yelp/business.json')

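# The split/explode imports above are presumably intended for flattening the per-business
# check-in dates. A hedged sketch, assuming a Yelp checkin.json at the same HDFS location
# whose `date` column is a comma-separated string of timestamps:
checkin = spark.read.json('hdfs://0.0.0.0:19000/yelp/checkin.json')
checkin_exploded = checkin.withColumn('date', explode(split(col('date'), ', ')))
checkin_exploded.show(5)
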
import findspark

# Point findspark at the local Spark installation before importing pyspark
findspark.init("C:/spark/spark-2.3.0-bin-hadoop2.7")
x = findspark.find()
print(x)

from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import StandardScaler

spark = SparkSession.builder.master("local").appName(
    "Linear Regression Model").config("spark.executor.memory", "1gb").getOrCreate()
sc = spark.sparkContext

rdd = sc.textFile(
    'C:/Users/Emad Ahmed/Desktop/CaliforniaHousing/cal_housing.data')
header = sc.textFile(
    'C:/Users/Emad Ahmed/Desktop/CaliforniaHousing/cal_housing.domain')
header.collect()

rdd = rdd.map(lambda line: line.split(","))

# RDD -> DF
df = rdd.map(lambda line: Row(longitude=line[0],
                              latitude=line[1],
                              housingMedianAge=line[2],
                              totalRooms=line[3],
                              totalBedRooms=line[4],

import findspark
import os
import sys

findspark.init()
spark_home = findspark.find()
# spark_home = os.environ.get('SPARK_HOME', None)
sys.path.insert(0, spark_home + "/python")

# Add py4j to the path.
# You may need to change the version number to match your install.
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip'))

# Add the MySQL connector package
packages = "mysql:mysql-connector-java:5.1.37"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages {0} pyspark-shell".format(packages)

# Initialize PySpark to predefine the SparkContext variable 'sc'
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))

def start():
    global __pythonManager, __gateway, __dependency_manager, __gmql_jar_path, __py4j_path
    logger = logging.getLogger()
    master = get_master()
    if master.lower().startswith('local'):
        logger.debug("Starting LOCAL backend (master: {})".format(master.lower()))
        java_home = os.environ.get("JAVA_HOME")
        if java_home is None:
            raise SystemError("The environment variable JAVA_HOME is not set")
        java_path = os.path.join(java_home, "bin", "java")
        _port = launch_gateway(classpath=__gmql_jar_path, die_on_exit=True,
                               java_path=java_path,
                               javaopts=get_local_java_options(),
                               jarpath=__py4j_path)
        __gateway = JavaGateway(gateway_parameters=GatewayParameters(
            port=_port, auto_convert=True))
        python_api_package = get_python_api_package(__gateway)
        __pythonManager = start_gmql_manager(python_api_package)

        conf = get_configuration()
        conf.set_master(master.lower())
        _set_spark_configuration(conf)
        _set_system_configuration(conf)
    else:
        # Use spark-submit
        logger.debug("Submitting backend to {}".format(master))
        master = re.sub("^spark_", "", master.lower())
        configs = get_spark_configs()
        spark_location = find()
        logger.debug("Found spark at location: {}".format(spark_location))
        command = [
            os.path.join(spark_location, 'bin', 'spark-submit'),
            '--master', master,
            '--deploy-mode', "client"
        ]
        for cname, c in configs.items():
            command.extend(['--conf', '{}={}'.format(cname, c)])
        command.append(__gmql_jar_path)
        stderr = open(os.devnull, "w")
        proc = Popen(command, stdout=PIPE, stdin=PIPE, stderr=stderr)
        # Wait until the submitted backend prints the port it is listening on
        while True:
            try:
                _port = int(proc.stdout.readline())
                break
            except ValueError:
                pass
        logger.debug("Backend listening at port {}".format(_port))
        redirect_stdout = open(os.devnull, "w")
        OutputConsumer(redirect_stdout, proc.stdout, daemon=True).start()
        ProcessConsumer(proc, [redirect_stdout], daemon=True).start()
        quiet_close(stderr)
        __gateway = JavaGateway(gateway_parameters=GatewayParameters(
            port=_port, auto_convert=True))
        pm = __gateway.entry_point.getPythonManager()
        pm.startEngine()
        __pythonManager = pm

sql_query = 'select * from db_pruebas..Consumer_Complaints_2'
df = sql_class.select_to_df(sql_query)
print(df.head())
print(df.columns)

### USING SPARK
ruta_json = 'spark_entornos.json'
spark_version = 'spark-2.4.4'
set_enviroment(ruta_json, spark_version)

import findspark
findspark.init()
findspark.find()

import pyspark
from pyspark import SparkContext, SparkConf, SQLContext

print(findspark.find())

appName = "PySpark SQL Server Example - via ODBC"
master = "local"
conf = SparkConf() \
    .setAppName(appName) \
    .setMaster(master)
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession
sparkDF = spark.createDataFrame(df)
sparkDF.show()

import findspark
import annealing_spark
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import json
from networkx.readwrite import json_graph

# Initialize findspark against the local Spark install before importing pyspark
findspark.init('/usr/local/opt/apache-spark/libexec')
findspark.find()

import pyspark
import influence_function

sc = pyspark.SparkContext()

with open("graph/nc_mini.json", "r") as graph_data:
    graph_data = json.load(graph_data)

NC_digraph = json_graph.node_link_graph(graph_data)
nodes_set = NC_digraph.nodes()


######################################################################################
#
# Greedy Algorithm Implementation
#
######################################################################################


def getMaxGreedy_2(nodes_set, N, curr_nodes):
    result = []
    max_node = None
    max_influence = 0
    for i in nodes_set:
