Example No. 1
def checkSpark():
    if request.method == 'POST':
        try:
            findspark.find()
            return "Spark is properly installed"
        except Exception as e:
            return (str(e), 400)
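The snippet above does not show how the view is registered. A minimal wiring sketch, assuming Flask (the global request object and the (body, status-code) return value match Flask's API); the /check-spark route name is hypothetical:

import findspark                  # used inside checkSpark
from flask import Flask, request  # `request` is the global the view reads

app = Flask(__name__)
# Register the existing view function under a hypothetical URL.
app.add_url_rule("/check-spark", view_func=checkSpark, methods=["POST"])

if __name__ == "__main__":
    app.run(port=5000)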
Example No. 2
def create_spark_session():
    """
    Create Spark session.
    """
    print("Create Spark session")
    findspark.init()
    findspark.find()

    conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
    sc = pyspark.SparkContext(conf=conf)
    spark = SparkSession(sc)

    return spark
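A minimal usage sketch for the function above, assuming the names it relies on (findspark, pyspark, SparkSession) are imported in the same module:

spark = create_spark_session()
spark.range(5).show()   # quick sanity check: a DataFrame with ids 0..4
spark.stop()            # also stops the underlying SparkContext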
Example No. 3
def test_make_class_model_saving(spark_df_class, pandas_factory_fixture):
    spark, df = spark_df_class
    path = findspark.find()
    path = os.path.join(path, 'models')
    test_path = os.path.join(path, 'test_class_model')
    test_path = test_path + '_' + str(int(time()))
    make_class_model(df,
                     spark,
                     test_path,
                     "test_class_model",
                     "test_target",
                     save=True)
    cluster = Cluster(['127.0.0.1'], "9042")
    session = cluster.connect("models")
    session.row_factory = pandas_factory_fixture
    session.default_fetch_size = None
    query_latest = "Select Max(timestamp) from models_statistics where model_name = 'test_class_model'"
    query_path = (
        "Select model_path from models_statistics where timestamp = %s and model_name = 'test_class_model'"
        " ALLOW FILTERING")
    latest_timestamp = session.execute(
        query_latest,
        timeout=None)._current_rows.iloc[0]['system.max(timestamp)']
    query_path = query_path % latest_timestamp
    path = session.execute(query_path,
                           timeout=None)._current_rows.iloc[0]['model_path']

    expected_path = test_path
    actual_path = path
    assert actual_path == expected_path
    session.shutdown()
    cluster.shutdown()
Example No. 4
def init_spark():
    print("Initializing Spark...")
    import findspark
    findspark.init()  # uses SPARK_HOME
    print("Spark found in : ", findspark.find())

    import pyspark
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    # Use a unique temp dir for the warehouse dir, so we can run multiple Spark sessions in one dir.
    # mkdtemp returns a plain path that is not auto-deleted, so the directory outlives this function.
    import tempfile
    tmpdir = tempfile.mkdtemp()
    # print(tmpdir)

    config = (
        SparkConf()
        .setAppName("TestApp")
        .setMaster("local[*]")
        .set("spark.executor.memory", "2g")  # full key name; "executor.memory" alone is not a recognized Spark setting
        .set("spark.sql.warehouse.dir", tmpdir)
        .set("some_property", "some_value")  # another example
    )

    print("Spark config:\n\t", config.toDebugString().replace("\n", "\n\t"))
    spark = SparkSession.builder.config(conf=config).getOrCreate()
    print('Spark UI running on port ' +
          spark.sparkContext.uiWebUrl.split(':')[2])

    return spark
Example No. 5
def writeToCassandra(stream, checkpoint="checkpoint"):
    # Create the spark-cassandra connection and write the data stream to the Cassandra database

    checkpointLocation = os.path.join(findspark.find(), checkpoint)
    checkpointLocation = os.path.join(checkpointLocation, checkpoint)
    checkpointLocation = checkpointLocation + '_' + str(int(time()))

    query = stream.writeStream \
        .format("org.apache.spark.sql.cassandra") \
        .outputMode('append') \
        .options(table="predictions", keyspace="predictions", checkpointLocation=checkpointLocation,
                 failOnDataLoss="false") \
        .start()
    # .awaitTermination() is deliberately not chained here; the caller decides when to block.

    return query
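To inspect what the streaming query has written, a minimal batch read-back sketch with the same connector (assumes the spark-cassandra-connector package is on the classpath and spark is an active SparkSession):

# Hedged sketch: read the predictions table back in batch mode.
written = (spark.read
           .format("org.apache.spark.sql.cassandra")
           .options(table="predictions", keyspace="predictions")
           .load())
written.show(10, truncate=False)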
Example No. 6
def test_make_regr_model_pipeline(spark_df_regr):
    spark, df = spark_df_regr
    path = findspark.find()
    path = os.path.join(path, 'models')
    test_path = os.path.join(path, 'test_regr_model')
    test_path = test_path + '_' + str(int(time()))
    model, _ = make_regr_model(df, spark, test_path, "test_regr_model",
                               "test_target", save=False)
    actual_stages = str([type(x) for x in model.stages])
    expected_stages = "[<class 'pyspark.ml.feature.StringIndexerModel'>," \
                      " <class 'pyspark.ml.feature.StringIndexerModel'>," \
                      " <class 'pyspark.ml.feature.OneHotEncoderModel'>," \
                      " <class 'pyspark.ml.feature.VectorAssembler'>," \
                      " <class 'pyspark.ml.feature.VectorAssembler'>," \
                      " <class 'pyspark.ml.feature.StandardScalerModel'>," \
                      " <class 'pyspark.ml.feature.VectorAssembler'>," \
                      " <class 'pyspark.ml.regression.RandomForestRegressionModel'>]"
    assert actual_stages == expected_stages
Example No. 7
def test_writeToCassandra(predictions, pandas_factory_fixture):
    predictions, spark, timestamp = predictions
    json_path = os.path.join(findspark.find(), "temp_json_test")
    json_file_path = os.path.join(json_path, "test" + str(int(time())))
    predictions.coalesce(1).write.format('json').save(json_file_path)
    schema = StructType() \
            .add("prediction", StringType()) \
            .add("model_path", StringType()) \
            .add("source_name", StringType()) \
            .add("individual", StringType()) \
            .add("target_column", StringType()) \
            .add("timestamp", IntegerType())

    streamingDF = (spark.readStream.schema(schema).option(
        "maxFilesPerTrigger", 6).json(json_file_path))

    connection = writeToCassandra(streamingDF,
                                  checkpoint=os.path.join(
                                      "test_checkpoint",
                                      "test" + str(int(time()))))
    connection.awaitTermination(5)

    cluster = Cluster(['127.0.0.1'], "9042")
    session = cluster.connect("predictions")
    session.row_factory = pandas_factory_fixture
    session.default_fetch_size = None
    query_count = "select count(*) from predictions.predictions where target_column = 'test_target'" \
                   " and timestamp = %s ALLOW FILTERING;"
    query_count = query_count % timestamp
    count = session.execute(query_count,
                            timeout=None)._current_rows.iloc[0]['count']
    session.shutdown()
    cluster.shutdown()

    assert streamingDF.isStreaming is True
    assert count > 0
Example No. 8
def load_and_train(source):
    plt = platform.system()
    if plt == "Linux":
        findspark.init("/home/smartcity/Downloads/spark-3.0.1-bin-hadoop2.7")
    path = findspark.find()
    path = os.path.join(path, 'models')

    if source == "powietrze":
        # Załadowanie tabeli powietrze z bazy danych master dataset
        data_pow, sc_pow = data_for_ml.powietrze_manipulation.load_powietrze()

        # Wytrenowanie modelu klsyfikacyjnego na wcześniej załadowanych danych

        powietrze_path = os.path.join(path, 'powietrze_model')
        powietrze_path = powietrze_path + '_' + str(int(time()))
        spark_ml.classificator.Classification.make_class_model(
            data_pow, sc_pow, powietrze_path, 'RF_pow', 'pm25')

    elif source == "urzedy":
        data_urz, sc_urz = data_for_ml.urzedy_manipulation.load_urzedy(
            agg="moving_average")

        urzedy_path = os.path.join(path, 'urzedy_model')
        urzedy_path = urzedy_path + '_' + str(int(time()))
        spark_ml.reggresor.Regression.make_regr_model(data_urz, sc_urz,
                                                      urzedy_path,
                                                      'RF_urz_mav',
                                                      "liczbaKlwKolejce")

    elif source == "velib":
        data_vel, sc_vel = data_for_ml.velib_manipulation.load_velib()

        velib_path = os.path.join(path, 'velib_model')
        velib_path = velib_path + '_' + str(int(time()))
        spark_ml.reggresor.Regression.make_regr_model(data_vel, sc_vel,
                                                      velib_path, 'RF_vel',
                                                      'numbikesavailable')
Example No. 9
#!/usr/bin/env python
# coding: utf-8


# Import the library required to locate the pyspark installation from a Jupyter notebook
import findspark
findspark.init()
findspark.find()
import pyspark

# Import the required pyspark libraries that will help us explode the list of checkin dates for each business into individual rows 
from pyspark.sql.functions import col, split
from pyspark.sql.functions import explode


# Import SparkSession from Pyspark
from pyspark.sql import SparkSession

import json
import csv
import re
import string
import pandas as pd

# Create spark object with the necessary configuration
spark = SparkSession.builder.appName('EDA').master('local').enableHiveSupport().getOrCreate()


# Read all the input files (stored on HDFS in JSON format) and create a spark dataframe on top of it
review = spark.read.json('hdfs://0.0.0.0:19000/yelp/review.json')
business = spark.read.json('hdfs://0.0.0.0:19000/yelp/business.json')
Example No. 10
import findspark
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import StandardScaler

# Note: findspark.init() is normally called before importing pyspark and before find();
# the ordering below only works because pyspark is already importable (e.g. pip-installed).
x = findspark.find()
print(x)
findspark.init("C:/spark/spark-2.3.0-bin-hadoop2.7")
spark = SparkSession.builder.master("local").appName(
    "Linear Regression Model").config("spark.executor.memory",
                                      "1gb").getOrCreate()
sc = spark.sparkContext

rdd = sc.textFile(
    'C:/Users/Emad Ahmed/Desktop/CaliforniaHousing/cal_housing.data')
header = sc.textFile(
    'C:/Users/Emad Ahmed/Desktop/CaliforniaHousing/cal_housing.domain')
header.collect()

rdd = rdd.map(lambda line: line.split(","))

# RDD -> DF
df = rdd.map(lambda line: Row(longitude=line[0],
                              latitude=line[1],
                              housingMedianAge=line[2],
                              totalRooms=line[3],
                              totalBedRooms=line[4],
Example No. 11
import findspark
import os
import sys
findspark.init()
spark_home = findspark.find()

#spark_home = os.environ.get('SPARK_HOME', None)
sys.path.insert(0, spark_home + "/python")

# Add the py4j to the path.
# You may need to change the version number to match your install
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip'))

# Adding the library to mysql connector
packages = "mysql:mysql-connector-java:5.1.37"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages {0} pyspark-shell".format(
    packages
)

# Initialize PySpark to predefine the SparkContext variable 'sc'
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))
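The py4j zip version above is hard-coded and execfile only exists in Python 2. A minimal Python 3 sketch of the same bootstrap, assuming a standard Spark layout: the py4j zip is resolved with glob and shell.py is run with exec() instead of execfile():

import glob
import os
import sys

import findspark

findspark.init()
spark_home = findspark.find()

# Put PySpark itself on the path.
sys.path.insert(0, os.path.join(spark_home, "python"))

# Locate the bundled py4j zip instead of hard-coding its version number.
py4j_zips = glob.glob(os.path.join(spark_home, "python", "lib", "py4j-*-src.zip"))
if py4j_zips:
    sys.path.insert(0, py4j_zips[0])

# Pass the MySQL connector to pyspark-shell via spark-submit arguments.
packages = "mysql:mysql-connector-java:5.1.37"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages {0} pyspark-shell".format(packages)

# Run shell.py to predefine the SparkContext variable 'sc'
# (Python 3 replacement for execfile).
shell_py = os.path.join(spark_home, "python", "pyspark", "shell.py")
exec(compile(open(shell_py, "rb").read(), shell_py, "exec"))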
Example No. 12
def start():
    global __pythonManager, __gateway, __dependency_manager, __gmql_jar_path, __py4j_path
    logger = logging.getLogger()
    master = get_master()

    if master.lower().startswith('local'):
        logger.debug("Starting LOCAL backend (master: {})".format(
            master.lower()))
        java_home = os.environ.get("JAVA_HOME")
        if java_home is None:
            raise SystemError("The environment variable JAVA_HOME is not set")
        java_path = os.path.join(java_home, "bin", "java")
        _port = launch_gateway(classpath=__gmql_jar_path,
                               die_on_exit=True,
                               java_path=java_path,
                               javaopts=get_local_java_options(),
                               jarpath=__py4j_path)
        __gateway = JavaGateway(gateway_parameters=GatewayParameters(
            port=_port, auto_convert=True))
        python_api_package = get_python_api_package(__gateway)
        __pythonManager = start_gmql_manager(python_api_package)

        conf = get_configuration()
        conf.set_master(master.lower())
        _set_spark_configuration(conf)
        _set_system_configuration(conf)
    else:
        # use spark-submit
        logger.debug("Submitting backend to {}".format(master))
        master = re.sub("^spark_", "", master.lower())
        configs = get_spark_configs()
        spark_location = find()
        logger.debug("Found spark at location: {}".format(spark_location))
        command = [
            os.path.join(spark_location, 'bin', 'spark-submit'), '--master',
            master, '--deploy-mode', "client"
        ]

        for cname, c in configs.items():
            command.extend(['--conf', '{}={}'.format(cname, c)])

        command.append(__gmql_jar_path)

        stderr = open(os.devnull, "w")
        proc = Popen(command, stdout=PIPE, stdin=PIPE, stderr=stderr)

        while True:
            try:
                _port = int(proc.stdout.readline())
                break
            except ValueError:
                pass

        logger.debug("Backend listening at port {}".format(_port))
        redirect_stdout = open(os.devnull, "w")
        OutputConsumer(redirect_stdout, proc.stdout, daemon=True).start()
        ProcessConsumer(proc, [redirect_stdout], daemon=True).start()
        quiet_close(stderr)

        __gateway = JavaGateway(gateway_parameters=GatewayParameters(
            port=_port, auto_convert=True))
        pm = __gateway.entry_point.getPythonManager()
        pm.startEngine()
        __pythonManager = pm
Example No. 13
    sql_query = 'select * from db_pruebas..Consumer_Complaints_2'

    df = sql_class.select_to_df(sql_query)

    print(df.head())
    print(df.columns)

    ###USING SPARK
    from pyspark import SparkContext, SparkConf, SQLContext

    ruta_json = 'spark_entornos.json'
    spark_version = 'spark-2.4.4'
    set_enviroment(ruta_json, spark_version)
    import findspark
    findspark.init()
    findspark.find()
    import pyspark
    print(findspark.find())

    appName = "PySpark SQL Server Example - via ODBC"
    master = "local"
    conf = SparkConf() \
        .setAppName(appName) \
        .setMaster(master)
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    spark = sqlContext.sparkSession

    sparkDF = spark.createDataFrame(df)
    sparkDF.show()
Example No. 14
import findspark
import annealing_spark
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import json
from networkx.readwrite import json_graph
findspark.find()
findspark.init('/usr/local/opt/apache-spark/libexec')
import pyspark
import influence_function

sc = pyspark.SparkContext()

with open("graph/nc_mini.json", "r") as graph_data:
    graph_data = json.load(graph_data)
    NC_digraph = json_graph.node_link_graph(graph_data)

nodes_set = NC_digraph.nodes()

######################################################################################
#
#Greedy Algorithm Implementation
#
######################################################################################

def getMaxGreedy_2(nodes_set, N, curr_nodes):
    result = []
    max_node = None
    max_influence = 0
    for i in nodes_set: