import os

from pyspark import SparkConf, SparkContext


def get_spark_context(workers="*", driver_memory=None, executor_memory=None):
    """Set up a local Spark context configured for SQL Server and AWS S3."""
    # We need some libraries (jars) to connect to SQL Server and S3, so put
    # every jar found in C:\Jars on the driver classpath (";" is the
    # classpath separator on Windows).
    jar_dir = r"C:\Jars"
    files = os.listdir(jar_dir)
    jars = [f for f in files if f.lower().endswith(".jar")]
    extra_class_path = ";".join([os.path.join(jar_dir, j) for j in jars])

    # Set up the Spark context.
    conf = SparkConf().setMaster(f"local[{workers}]") \
        .set("spark.driver.extraClassPath", extra_class_path) \
        .set("spark.executor.heartbeatInterval", "60s")
    if driver_memory:
        conf.set("spark.driver.memory", driver_memory)
    if executor_memory:
        conf.set("spark.executor.memory", executor_memory)
    spark_context = SparkContext(conf=conf)

    # We need to configure our S3 endpoint because our buckets are in London.
    spark_context.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
    spark_context._jsc.hadoopConfiguration().set("fs.s3a.endpoint",
                                                 "s3.eu-west-2.amazonaws.com")
    return spark_context
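# A minimal usage sketch (an assumption, not from the source): it presumes
# pyspark is installed and that C:\Jars contains the SQL Server JDBC driver
# plus hadoop-aws/aws-java-sdk jars matching the local Hadoop build.
if __name__ == "__main__":
    sc = get_spark_context(workers=4, driver_memory="4g")
    print(sc.parallelize(range(10)).sum())  # smoke test: should print 45
    sc.stop()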
import os

from py4j.java_gateway import JavaGateway, GatewayClient, java_import
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession


def __init__(self):
    """Initialize the connection to the Java gateway and fetch the Spark
    context when a Python script is launched from Scala.

    Adapted from:
    https://github.com/apache/zeppelin/blob/master/spark/interpreter/src/main/resources/python/zeppelin_pyspark.py#L30
    """
    if os.environ.get("SPARK_EXECUTOR_URI"):
        SparkContext.setSystemProperty("spark.executor.uri",
                                       os.environ["SPARK_EXECUTOR_URI"])
    gateway = JavaGateway(
        GatewayClient(port=int(os.environ.get("PYSPARK_GATEWAY_PORT"))),
        auto_convert=True)
    java_import(gateway.jvm, "org.apache.spark.SparkEnv")
    java_import(gateway.jvm, "org.apache.spark.SparkConf")
    java_import(gateway.jvm, "org.apache.spark.api.java.*")
    java_import(gateway.jvm, "org.apache.spark.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.sql.*")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
    intp = gateway.entry_point
    jSparkSession = intp.pyGetSparkSession()
    jsc = intp.pyGetJSparkContext(jSparkSession)
    jconf = intp.pyGetSparkConf(jsc)
    conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf)
    self.sc = SparkContext(jsc=jsc, gateway=gateway, conf=conf)
    # Spark 2
    self.sparkSession = SparkSession(self.sc, jSparkSession)
    self.sqlContext = self.sparkSession._wrapped
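# Usage note (an assumption beyond the excerpt): the Scala side must expose a
# gateway entry point providing pyGetSparkSession/pyGetJSparkContext/
# pyGetSparkConf, and must export the gateway port before launching Python:
#   export PYSPARK_GATEWAY_PORT=25333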
# Module-level cache for the singleton context (its initialization is not
# shown in the source excerpt).
_sc = None


def _get_or_create_context():
    global _sc
    if not _sc:
        from pyspark.conf import SparkConf
        from pyspark.context import SparkContext
        settings = GlobalPreferences['spark_configuration']
        SparkContext.setSystemProperty("spark.executor.memory",
                                       settings['executor-memory'])
        SparkContext.setSystemProperty("spark.driver.memory",
                                       settings['driver-memory'])
        conf = SparkConf()
        conf = conf.setMaster(settings['master'])
        # Reuses the driver-memory setting as the result-size cap.
        conf = conf.set('spark.driver.maxResultSize', settings['driver-memory'])
        _sc = SparkContext.getOrCreate(conf=conf)
        _sc.setLogLevel("WARN")
    return _sc
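# A hypothetical shape for GlobalPreferences (an assumption; its definition
# is not part of the source excerpt), followed by a usage example:
GlobalPreferences = {
    'spark_configuration': {
        'master': 'local[*]',
        'executor-memory': '4g',
        'driver-memory': '4g',
    }
}

sc = _get_or_create_context()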
"""
This file is designed to be launched as a PYTHONSTARTUP script.
"""
import atexit
import os
import platform

import py4j

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.storagelevel import StorageLevel

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri",
                                   os.environ["SPARK_EXECUTOR_URI"])

SparkContext._ensure_initialized()

try:
    # Try to access HiveConf; it will raise an exception if Hive is not on
    # the classpath.
    SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
    spark = SparkSession.builder \
        .enableHiveSupport() \
        .getOrCreate()
except py4j.protocol.Py4JError:
    spark = SparkSession.builder.getOrCreate()
except TypeError:
    spark = SparkSession.builder.getOrCreate()

sc = spark.sparkContext
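# Hypothetical invocation (not in the source): point PYTHONSTARTUP at this
# file to drop into a Python shell with `spark` and `sc` pre-created, e.g.
#   PYTHONSTARTUP=shell_startup.py python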
import os
import socket

from pyspark import SparkConf, SparkContext


def ui_get_available_port(port=4040):
    # NOTE: the head of this function was truncated in the source; the
    # socket probe below is a reconstruction of the apparent intent.
    while True:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            in_use = sock.connect_ex(("127.0.0.1", port)) == 0
        # try again if port unavailable
        if in_use:
            port += 1
        else:
            # return the first available port
            return port


# this is the deprecated equivalent of ADD_JARS
add_files = None
if os.environ.get("ADD_FILES") is not None:
    add_files = os.environ.get("ADD_FILES").split(',')

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri",
                                   os.environ["SPARK_EXECUTOR_URI"])

# set up the Mesos-based connection
conf = (SparkConf()
        .setMaster(os.environ["SPARK_MASTER"]))

# set the UI port
conf.set("spark.ui.port", ui_get_available_port())

# configure Docker containers as executors
conf.setSparkHome(os.environ.get("SPARK_HOME"))
conf.set("spark.mesos.executor.docker.image",
         "lab41/spark-mesos-dockerworker-ipython")
conf.set("spark.mesos.executor.home",
         "/usr/local/spark-1.4.1-bin-hadoop2.4")
conf.set("spark.executorEnv.MESOS_NATIVE_LIBRARY",
         "/usr/local/lib/libmesos.so")
conf.set("spark.network.timeout", "100")
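# A minimal continuation sketch (an assumption, not from the source): create
# the context from the Mesos conf above and ship any ADD_FILES entries to the
# executors. Assumes SPARK_MASTER points at a Mesos master, e.g.
# mesos://host:5050.
sc = SparkContext(conf=conf)
if add_files:
    for f in add_files:
        sc.addFile(f)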
from datetime import datetime as dt

from py4j.protocol import Py4JJavaError
from pyspark import SparkConf, SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF, StopWordsRemover, StringIndexer, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import broadcast, col, monotonically_increasing_id, regexp_replace, udf
from pyspark.sql.types import DateType, IntegerType, StringType, StructField, StructType


def main():
    # Instantiate SparkConf and send extraJavaOptions to both executors and drivers
    spark_conf = (SparkConf().set(
        'spark.executor.extraJavaOptions',
        '-Dcom.amazonaws.services.s3.enableV4=true').set(
            'spark.driver.extraJavaOptions',
            '-Dcom.amazonaws.services.s3.enableV4=true'))

    # Instantiate SparkContext based on SparkConf
    sc = SparkContext(conf=spark_conf)

    # Set enableV4 property to access S3 input data
    sc.setSystemProperty('com.amazonaws.services.s3.enableV4', 'true')

    # Get the Hadoop Configuration
    hadoopConf = sc._jsc.hadoopConfiguration()

    # Set Hadoop configuration K-V pairs (these legacy s3a credential key
    # names date from early hadoop-aws releases; current releases use
    # fs.s3a.access.key / fs.s3a.secret.key)
    if is_not_blank(AWS_ACCESS_KEY_ID):
        hadoopConf.set('fs.s3a.awsAccessKeyId', AWS_ACCESS_KEY_ID)
    if is_not_blank(AWS_SECRET_ACCESS_KEY):
        hadoopConf.set('fs.s3a.awsSecretAccessKey', AWS_SECRET_ACCESS_KEY)
    hadoopConf.set('com.amazonaws.services.s3a.enableV4', 'true')
    hadoopConf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')

    # Create the SparkSession on top of the existing SparkContext
    spark_session = (SparkSession.builder.appName('ComplaintClassificator')
                     .config(conf=spark_conf).getOrCreate())

    # Timestamp of start
    start_timestamp = dt.now()

    # Reuse the session's SparkContext
    sc = spark_session.sparkContext

    # Instantiate SQLContext
    sql_ctx = SQLContext(sc)

    # Set log level to 'WARN'
    sc.setLogLevel('WARN')

    # Set up log4j logging
    log4j_logger = sc._jvm.org.apache.log4j
    logger = log4j_logger.LogManager.getLogger(__name__)

    # Create the schema as a StructType of StructField(s)
    schema = StructType([
        StructField('ReceivedDate', StringType(), True),
        StructField('Product', StringType(), True),
        StructField('Subproduct', StringType(), True),
        StructField('Issue', StringType(), True),
        StructField('Subissue', StringType(), True),
        StructField('ConsumerComplaintNarrative', StringType(), True),
        StructField('CompanyPublicResponse', StringType(), True),
        StructField('CompanyName', StringType(), True),
        StructField('State', StringType(), True),
        StructField('ZipCode', IntegerType(), True),
        StructField('Tags', StringType(), True),
        StructField('IsConsumerConsent', StringType(), True),
        StructField('SubmittedVia', StringType(), True),
        StructField('SentDate', StringType(), True),
        StructField('CompanyResponseToConsumer', StringType(), True),
        StructField('IsTimelyResponse', StringType(), True),
        StructField('IsConsumerDisputed', StringType(), True),
        StructField('ComplaintId', IntegerType(), True)
    ])

    logger.warn("Starting preprocessing and data cleansing...")

    # Read the Consumer_Complaints.csv file and apply the schema
    complaint_df = (spark_session.read.format('csv')
                    .option('header', 'true')
                    .option('delimiter', ',')
                    .option('mode', 'FAILFAST')
                    .option('parserLib', 'univocity')
                    .option('escape', '"')
                    .option('multiLine', 'true')
                    .option('inferSchema', 'false')
                    .schema(schema)
                    .load(CONSUMER_COMPLAINTS)
                    .alias('complaint_df'))

    logger.warn("Explaining complaint_df...")
    complaint_df.explain()
    logger.warn("complaint_df has %d records, %d columns."
                % (complaint_df.count(), len(complaint_df.columns)))
    logger.warn("Printing schema of complaint_df: ")
    complaint_df.printSchema()

    # Register the cleanse_field function as a UDF (user-defined function)
    udf_cleansed_field = udf(cleanse_field, StringType())

    # UDF that parses 'MM/DD/YYYY' strings into DateType values
    change_data_format = udf(lambda x: dt.strptime(x, '%m/%d/%Y'), DateType())

    # Do some clean-up activities
    cleansed_df = (complaint_df.withColumn(
        'Issue',
        udf_cleansed_field(complaint_df['ConsumerComplaintNarrative']))
        .withColumn('ReceivedDate',
                    change_data_format(complaint_df['ReceivedDate'])))

    logger.warn("Explaining cleansed_df...")
    cleansed_df.explain()
    logger.warn("cleansed_df has %d records, %d columns."
                % (cleansed_df.count(), len(cleansed_df.columns)))
    logger.warn("Printing schema of cleansed_df: ")
    cleansed_df.printSchema()

    # Reduce the number of fields and filter out null consumer complaint narratives
    final_complaints_df = (cleansed_df.where(
        cleansed_df['ConsumerComplaintNarrative'].isNotNull()).select(
            'ComplaintId', 'ReceivedDate', 'State', 'Product',
            'ConsumerComplaintNarrative',
            'Issue').orderBy(cleansed_df['ReceivedDate']))
    final_complaints_df.createOrReplaceTempView("final_complaints_df")

    # Check a sample of ConsumerComplaintNarrative and Issue content
    sql_ctx.sql("""
       SELECT RowNum, ConsumerComplaintNarrative, Issue
         FROM (SELECT ROW_NUMBER() OVER (PARTITION BY State ORDER BY ReceivedDate DESC) AS RowNum,
                      ConsumerComplaintNarrative, Issue, ReceivedDate, State
                 FROM final_complaints_df) fc
        WHERE RowNum = 1
        LIMIT 10
    """).show()

    logger.warn("Explaining final_complaints_df...")
    final_complaints_df.explain()
    logger.warn(
        "final_complaints_df has %d records, %d columns."
        % (final_complaints_df.count(), len(final_complaints_df.columns)))
    logger.warn("Printing schema of final_complaints_df: ")
    final_complaints_df.printSchema()

    # Read the states JSON as a states_df DataFrame
    states_df = (spark_session.read.json(AMERICAN_STATES,
                                         multiLine=True).alias('states_df'))

    logger.warn("Explaining states_df...")
    states_df.explain()
    logger.warn("states_df has %d records, %d columns."
                % (states_df.count(), len(states_df.columns)))
    logger.warn("Printing schema of states_df: ")
    states_df.printSchema()

    # Fields to drop (not needed for further processing)
    drop_list = ['state', 'abbreviation']

    # Join complaints with American states, add an index field and drop the
    # unnecessary fields
    joined_df = (final_complaints_df.join(
        broadcast(states_df),
        col('complaint_df.State') == col('states_df.abbreviation'),
        "left").withColumnRenamed('ConsumerComplaintNarrative',
                                  'ConsumerComplaint').withColumn(
                                      'RowNoIndex',
                                      monotonically_increasing_id()).select(
                                          'Product', 'ConsumerComplaint',
                                          'name').drop(*drop_list))
    joined_df.createOrReplaceTempView("joined_df")

    # Check a sample of FullStateName content
    sql_ctx.sql("""
       SELECT RowNum, Product, ConsumerComplaint, FullStateName
         FROM (SELECT ROW_NUMBER() OVER (PARTITION BY Product ORDER BY ConsumerComplaint DESC) AS RowNum,
                      Product, ConsumerComplaint, name AS FullStateName
                 FROM joined_df) jd
        WHERE RowNum = 1
        LIMIT 10
    """).show()

    logger.warn("Explaining joined_df...")
    joined_df.explain()
    logger.warn("joined_df has %d records, %d columns."
                % (joined_df.count(), len(joined_df.columns)))
    logger.warn("Printing schema of joined_df: ")
    joined_df.printSchema()

    # Check the unique labels of the Product attribute before the replace
    joined_df.select('Product').distinct().show()

    # Collapse redundant labels in the Product field
    renamed_df = (joined_df.withColumn(
        'Product',
        regexp_replace(
            'Product',
            "Credit reporting, credit repair services, or other personal consumer reports",
            "Credit reporting, repair, or other")
    ).withColumn(
        'Product',
        regexp_replace("Product", "Virtual currency",
                       "Money transfer, virtual currency, or money service")
    ).withColumn(
        'Product',
        regexp_replace(
            "Product", "Money transfer",
            "Money transfer, virtual currency, or money service")).withColumn(
                'Product',
                regexp_replace(
                    "Product", "Payday loan",
                    "Payday loan, title loan, or personal loan")).withColumn(
                        'Product',
                        regexp_replace(
                            "Product", "Credit reporting",
                            "Credit reporting, repair, or other")).withColumn(
                                'Product',
                                regexp_replace(
                                    "Product", "Prepaid card",
                                    "Credit card or prepaid card")).withColumn(
                                        'Product',
                                        regexp_replace(
                                            "Product", "Credit card",
                                            "Credit card or prepaid card")))
    renamed_df.createOrReplaceTempView("renamed_df")

    # Check how many unique labels (classes) there are
    sql_ctx.sql("""
       SELECT DISTINCT Product
         FROM renamed_df
    """).show()

    # Check how many times each class occurs in the corpus
    sql_ctx.sql("""
       SELECT Product, count(*)
         FROM renamed_df
        GROUP BY Product
        ORDER BY count(*) DESC""").show(50, False)

    logger.warn("Explaining renamed_df...")
    renamed_df.explain()

    # Check the unique labels of the Product attribute after the replace
    renamed_df.select('Product').distinct().show()

    # Check the number of unique labels of the Product attribute after the replace
    logger.warn(str(renamed_df.select('Product').distinct().count()))

    logger.warn("Starting feature extraction...")

    # Tokenize the consumer complaint sentences
    tokenizer = Tokenizer(inputCol='ConsumerComplaint', outputCol='Words')

    # Remove stop words
    remover = StopWordsRemover(inputCol='Words', outputCol='FilteredWords')

    # Hash term frequencies (numFeatures is tuned in the CV grid below)
    hashing_tf = HashingTF(inputCol='FilteredWords', outputCol='RawFeatures')

    # minDocFreq: minimum number of documents a term must appear in to be kept
    idf = IDF(inputCol='RawFeatures', outputCol='features')

    # Instantiate StringIndexer
    product_indexer = StringIndexer(inputCol='Product', outputCol='label')

    # Create a pipeline from the feature extraction stages defined above
    pipeline = Pipeline(
        stages=[tokenizer, remover, hashing_tf, idf, product_indexer])

    # Fit the pipeline to renamed_df
    pipeline_fit = pipeline.fit(renamed_df)

    # Transform renamed_df with the fitted pipeline
    data_set = pipeline_fit.transform(renamed_df)

    # Randomly split the data into training and test sets with a 70/30 ratio
    (training_data, test_data) = data_set.randomSplit([0.7, 0.3], seed=100)

    # Cache training_data
    training_data.cache()

    logger.warn("Starting Naive-Bayes...")

    # Naive-Bayes
    nb = NaiveBayes(labelCol='label',
                    featuresCol='features',
                    modelType='multinomial')

    # Create a model without cross-validation
    nb_model = nb.fit(training_data)

    # Make predictions with the model trained without cross-validation
    predictions = nb_model.transform(test_data)

    print("NaiveBayes without CV model type: ", nb.getModelType())
    print("NaiveBayes without CV smoothing factor: ", str(nb.getSmoothing()))

    # NB without CV metrics
    nb_metrics_rdd = MulticlassMetrics(predictions['label', 'prediction'].rdd)

    # NB stats by class (label)
    labels = predictions.rdd.map(lambda cols: cols.label).distinct().collect()
    logger.warn("Printing NB stats...")
    for label in sorted(labels):
try: print("Class %s precision = %s" % (label, nb_metrics_rdd.precision(label))) print("Class %s recall = %s" % (label, nb_metrics_rdd.recall(label))) print("Class %s F1 Measure = %s" % (label, nb_metrics_rdd.fMeasure(label, beta=1.0))) except Py4JJavaError: pass # Weighted stats print("Weighted recall = %s" % nb_metrics_rdd.weightedRecall) print("Weighted precision = %s" % nb_metrics_rdd.weightedPrecision) print("Weighted F(1) Score = %s" % nb_metrics_rdd.weightedFMeasure()) print("Weighted F(0.5) Score = %s" % nb_metrics_rdd.weightedFMeasure(beta=0.5)) print("Weighted false positive rate = %s" % nb_metrics_rdd.weightedFalsePositiveRate) # Show 10 results of predictions that haven't been predicted successfully predictions.filter(predictions['prediction'] != predictions['label']) \ .select("Product", "ConsumerComplaint", "probability", "label", "prediction") \ .orderBy("probability", ascending=False) \ .show(n=10, truncate=20) # Show 10 results of predictions that have been predicted successfully predictions.filter(predictions['prediction'] == predictions['label']) \ .select("Product", "ConsumerComplaint", "probability", "label", "prediction") \ .orderBy("probability", ascending=False) \ .show(n=10, truncate=20) # Instantiate an evaluation of predictions without Cross Validation evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction") # Evaluate best model without an use of Cross Validation accuracy_without_cv = evaluator.evaluate(predictions) print("Naive-Bayes accuracy without Cross Validation = %s (metric)" % str(nb_metrics_rdd.accuracy)) logger.warn("Starting Cross Validation...") # Instantiate ParamGridBuilder for the Cross Validation purpose nbp_params_grid = (ParamGridBuilder().addGrid( nb.smoothing, [0.8, 0.9, 1.0]).addGrid(hashing_tf.numFeatures, [700, 720]).addGrid(idf.minDocFreq, [3, 4, 5]).build()) # Instantiate the Evaluator of the model nb_evaluator = MulticlassClassificationEvaluator( labelCol='label', predictionCol='prediction') # Instantiate 5-fold CrossValidator nb_cv = CrossValidator(estimator=nb, estimatorParamMaps=nbp_params_grid, evaluator=nb_evaluator, numFolds=5) # Create a model with Cross Validation nb_cv_model = nb_cv.fit(training_data) # Make predictions on model with Cross Validation cv_predictions = nb_cv_model.transform(training_data) # Evaluate best model with an use of Cross Validation accuracy_with_cv = nb_evaluator.evaluate(cv_predictions) print("Naive-Bayes accuracy with Cross Validation:", str(accuracy_with_cv)) print( "Improvement for the best fitted model (NB with CV) in regard of NB: ", str(accuracy_with_cv - nb_metrics_rdd.accuracy)) # NB with CV metrics nb_with_cv_metrics_rdd = MulticlassMetrics( cv_predictions['label', 'prediction'].rdd) # NB with CV stats by each class (label) labels = cv_predictions.rdd.map(lambda att: att.label).distinct().collect() logger.warn("Printing NB stats...") for label in sorted(labels): try: print("Class %s precision = %s" % (label, nb_with_cv_metrics_rdd.precision(label))) print("Class %s recall = %s" % (label, nb_with_cv_metrics_rdd.recall(label))) print("Class %s F1 Measure = %s" % (label, nb_with_cv_metrics_rdd.fMeasure(label, beta=1.0))) except Py4JJavaError: pass # Print weighted stats print("Weighted recall = %s" % nb_with_cv_metrics_rdd.weightedRecall) print("Weighted precision = %s" % nb_with_cv_metrics_rdd.weightedPrecision) print("Weighted F(1) Score = %s" % nb_with_cv_metrics_rdd.weightedFMeasure()) print("Weighted F(0.5) Score = %s" % 
    print("Weighted F(0.5) Score = %s" %
          nb_with_cv_metrics_rdd.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" %
          nb_with_cv_metrics_rdd.weightedFalsePositiveRate)

    # Show 10 cv_predictions that were predicted successfully
    (cv_predictions.filter(
        cv_predictions['prediction'] == cv_predictions['label']).select(
            'Product', 'ConsumerComplaint', 'probability', 'label',
            'prediction').orderBy('probability',
                                  ascending=False).show(n=10, truncate=20))

    # Show 10 cv_predictions that were not predicted successfully
    (cv_predictions.filter(
        cv_predictions['prediction'] != cv_predictions['label']).select(
            'Product', 'ConsumerComplaint', 'probability', 'label',
            'prediction').orderBy('probability',
                                  ascending=False).show(n=10, truncate=20))

    # Timestamp of end
    end_timestamp = dt.now()

    # Print the elapsed time
    print("Elapsed time: %s" % str(end_timestamp - start_timestamp))

    # Stop the SparkSession
    spark_session.stop()
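# A hedged sketch (not part of the source) of the alternative the NOTE in
# main() refers to: cross-validating the whole Pipeline so that the
# hashing_tf.numFeatures and idf.minDocFreq grid entries actually take
# effect. It would replace the nb_cv block inside main(); the raw-text
# splits of renamed_df are assumptions.
def cross_validate_full_pipeline(renamed_df, tokenizer, remover, hashing_tf,
                                 idf, product_indexer, nb, params_grid,
                                 evaluator):
    from pyspark.ml import Pipeline
    from pyspark.ml.tuning import CrossValidator
    # End the pipeline with the classifier so every grid point re-extracts
    # features before fitting NaiveBayes.
    full_pipeline = Pipeline(
        stages=[tokenizer, remover, hashing_tf, idf, product_indexer, nb])
    raw_train, raw_test = renamed_df.randomSplit([0.7, 0.3], seed=100)
    cv = CrossValidator(estimator=full_pipeline,
                        estimatorParamMaps=params_grid,
                        evaluator=evaluator,
                        numFolds=5)
    cv_model = cv.fit(raw_train)
    return cv_model, cv_model.transform(raw_test)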
import configparser
import os

import findspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# `files` is assumed to be a boto3 list_objects response fetched earlier in
# the source; collect every key as an s3n:// path.
outpatient_files = []
for i in files['Contents']:
    outpatient_files.append('s3n://cms-data-1/' + i['Key'])

dir = "Files/"

os.environ['PYSPARK_SUBMIT_ARGS'] = \
    "--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell"

# Read AWS credentials from the default profile
config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/credentials"))
config['default']  # fail fast if the default profile is missing
access_id = config.get('default', "aws_access_key_id")
access_key = config.get('default', "aws_secret_access_key")

findspark.init()

SparkContext.setSystemProperty('spark.driver.memory', '25G')
SparkContext.setSystemProperty('spark.executor.memory', '15G')
conf = SparkConf().setAppName('pyspark_model')
conf = (conf.setMaster('local[*]')
        .set('spark.executor.memory', '15G')
        .set('spark.driver.memory', '25G')
        .set('spark.driver.maxResultSize', '15G'))
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3n.impl",
                "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoop_conf.set("fs.s3n.awsAccessKeyId", access_id)
hadoop_conf.set("fs.s3n.awsSecretAccessKey", access_key)
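# A minimal follow-on sketch (an assumption, not from the source): read the
# collected outpatient files with the s3n-enabled session configured above.
outpatient_df = spark.read.csv(outpatient_files, header=True)
outpatient_df.printSchema()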
import os
from time import time

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import DecisionTreeClassifier

"""
This script creates a one-vs-rest model using a random forest classifier.
"""
##########################################
##         Author: Kumar Awanish        ##
##########################################

memory = '4g'
pyspark_submit_args = ' --driver-memory ' + memory + ' pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

SparkContext.setSystemProperty('spark.executor.memory', '4g')
sc = SparkContext("local", "App Name")
spark = SparkSession(sc)

datapath = '/Users/akumarmandapati/git/iosl/CSVs/Allcsv/final.csv'
datapath_DosGE = '/Users/akumarmandapati/git/Iosl/CSVs/Wednesday-workingHours.pcap_ISCX.csv'

# Load and parse the data file, converting it to a DataFrame.
try:
    print("Reading Dataset")
    dataset = spark.read.csv(datapath, header=True)
except Exception:
    print("Error parsing file, check the path")

print("Input N for not including Benign data")
choice = input()
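# A hedged sketch (not in the source excerpt) of the one-vs-rest setup the
# docstring describes, using the imported DecisionTreeClassifier as the base
# binary classifier; the 'label'/'features' column names are assumptions
# about the upstream feature preparation.
from pyspark.ml.classification import OneVsRest

dt_clf = DecisionTreeClassifier(labelCol='label', featuresCol='features')
ovr = OneVsRest(classifier=dt_clf, labelCol='label', featuresCol='features')
# ovr_model = ovr.fit(training_data)
# ovr_predictions = ovr_model.transform(test_data)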
import os
import random
import time

import findspark
import py4j


def start(self):
    epochTime = int(time.mktime(time.localtime()))
    randomNum = random.randint(0, 10000)
    self.__appName = "pyspark-" + str(epochTime) + "-" + str(randomNum)

    value = self.__args.get('spark.home')
    if not value:
        value = "/data/software/spark-2.3.1-bin-hadoop2.7"
    findspark.init(spark_home=value, python_path="python3")

    # NOTE: set_driver_mem/set_executor_mem/set_app_name/end are not part of
    # the stock findspark package; this code appears to rely on a patched
    # build that exposes them.
    value = self.__args.get('spark.driver.memory')
    if not value:
        value = "3g"
    findspark.set_driver_mem(value)

    value = self.__args.get('spark.executor.memory')
    if not value:
        value = "3g"
    findspark.set_executor_mem(value)

    findspark.set_app_name(self.__appName)
    findspark.end()

    import pyspark
    from pyspark import SparkConf
    from pyspark.sql import SparkSession, SQLContext
    from pyspark.context import SparkContext

    if os.environ.get("SPARK_EXECUTOR_URI"):
        SparkContext.setSystemProperty("spark.executor.uri",
                                       os.environ["SPARK_EXECUTOR_URI"])
    SparkContext._ensure_initialized()

    pySpark = None
    sc_conf = SparkConf()
    sc_conf.set('spark.locality.wait', 30000)
    sc_conf.set('spark.sql.autoBroadcastJoinThreshold', -1)
    sc_conf.set('spark.scheduler.minRegisteredResourcesRatio', 1)

    value = self.__args.get('spark.executor.cores')
    if not value:
        value = '1'
    sc_conf.set('spark.executor.cores', int(value))

    value = self.__args.get('spark.executor.instances')
    if not value:
        value = '1'
    sc_conf.set('spark.executor.instances', int(value))

    try:
        # Try to access HiveConf; it will raise an exception if Hive is not
        # on the classpath.
        SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
        spark = SparkSession.builder.enableHiveSupport().config(
            conf=sc_conf).getOrCreate()
    except py4j.protocol.Py4JError:
        spark = SparkSession.builder.config(conf=sc_conf).getOrCreate()
    except TypeError:
        spark = SparkSession.builder.config(conf=sc_conf).getOrCreate()
    sc = spark.sparkContext
    return spark, sc
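# Hypothetical usage (an assumption: only the start() method appears in the
# source, so the owning class name and its argument dict are illustrative):
#   launcher = SparkLauncher({'spark.executor.cores': '2'})
#   spark, sc = launcher.start()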
from datetime import datetime

from sklearn.metrics import mean_squared_error as mse
import math

import config  # local settings module (assumed; referenced below)

# flag to confirm the writing of the forecasted value to the db
real_flag = config.real_flag

total_t1 = datetime.now()

## Logging ##
import os
import sys

from pyspark.context import SparkContext
from pyspark.sql import SparkSession
#import pyspark

SparkContext.setSystemProperty('spark.executor.cores', '16')

full_t1 = datetime.now()

# initialise the SparkContext
#conf1 = pyspark.SparkConf().setAll([('spark.executor.memory', '24g'), ('spark.executor.cores', 8), ('spark.cores.max', 8), ('spark.driver.memory','24g')])
#spark2 = SparkSession.builder.config(conf=conf1).getOrCreate()
spark1 = SparkSession.builder \
    .master(config.sp_master) \
    .appName(config.sp_appname) \
    .config('spark.executor.memory', config.sp_memory) \
    .config("spark.cores.max", config.sp_cores) \
    .config('spark.executor.cores', config.sp_cores) \
    .getOrCreate()
sc = spark1.sparkContext
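# Hypothetical shape of the local `config` module assumed above (all values
# are illustrative):
#   sp_master  = 'spark://master:7077'
#   sp_appname = 'forecast-job'
#   sp_memory  = '24g'
#   sp_cores   = 8
#   real_flag  = False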