Example #1
import os

from pyspark import SparkConf, SparkContext

def get_spark_context(workers="*", driver_memory=None, executor_memory=None):
    """
    This function sets up a local Spark context, configured for use with SQL Server and AWS S3.
    """

    # We need some libraries (JARs) to connect to SQL Server and S3, so build the driver classpath from the JAR directory
    jar_dir = r"C:\Jars"

    files = os.listdir(jar_dir)

    jars = [f for f in files if f.lower().endswith(".jar")]

    extra_class_path = ";".join([os.path.join(jar_dir, j) for j in jars])

    # setup spark context
    conf = SparkConf().setMaster(f"local[{workers}]") \
        .set("spark.driver.extraClassPath", extra_class_path) \
        .set("spark.executor.heartbeatInterval", "60s")

    if driver_memory:
        conf.set("spark.driver.memory", driver_memory)

    if executor_memory:
        conf.set("spark.executor.memory", executor_memory)

    spark_context = SparkContext(conf=conf)

    # Configure the S3 endpoint explicitly because our buckets are in London (eu-west-2), which requires Signature V4 requests
    spark_context.setSystemProperty("com.amazonaws.services.s3.enableV4",
                                    "true")
    spark_context._jsc.hadoopConfiguration().set("fs.s3a.endpoint",
                                                 "s3.eu-west-2.amazonaws.com")

    return spark_context
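
A minimal usage sketch for the helper above (the worker count and memory value are arbitrary placeholders; the JAR directory C:\Jars must exist):

# Hypothetical usage: two local workers and 4g of driver memory
sc = get_spark_context(workers=2, driver_memory="4g")
print(sc.version)
sc.stop()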
Example #2
    def __init__(self):
        """When running a Python script from Scala, this function is called
        by the script to initialize the connection to the Java gateway and obtain the Spark context.
        The code is essentially copied from:
        https://github.com/apache/zeppelin/blob/master/spark/interpreter/src/main/resources/python/zeppelin_pyspark.py#L30
        """
        from py4j.java_gateway import java_import, JavaGateway, GatewayClient

        if os.environ.get("SPARK_EXECUTOR_URI"):
            SparkContext.setSystemProperty("spark.executor.uri",
                                           os.environ["SPARK_EXECUTOR_URI"])

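        # PYSPARK_GATEWAY_PORT is expected to be exported by the JVM/Scala side that launched this script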
        gateway = JavaGateway(
            GatewayClient(port=int(os.environ.get("PYSPARK_GATEWAY_PORT"))),
            auto_convert=True)
        java_import(gateway.jvm, "org.apache.spark.SparkEnv")
        java_import(gateway.jvm, "org.apache.spark.SparkConf")
        java_import(gateway.jvm, "org.apache.spark.api.java.*")
        java_import(gateway.jvm, "org.apache.spark.api.python.*")
        java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
        java_import(gateway.jvm, "org.apache.spark.sql.*")
        java_import(gateway.jvm, "org.apache.spark.sql.hive.*")

        intp = gateway.entry_point

        jSparkSession = intp.pyGetSparkSession()
        jsc = intp.pyGetJSparkContext(jSparkSession)
        jconf = intp.pyGetSparkConf(jsc)
        conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf)
        self.sc = SparkContext(jsc=jsc, gateway=gateway, conf=conf)

        # Spark 2
        self.sparkSession = SparkSession(self.sc, jSparkSession)
        self.sqlContext = self.sparkSession._wrapped
Example #3
def _get_or_create_context():
    global _sc
    if not _sc:
        from pyspark.conf import SparkConf
        from pyspark.context import SparkContext
        settings = GlobalPreferences['spark_configuration']
        SparkContext.setSystemProperty("spark.executor.memory",settings['executor-memory'])
        SparkContext.setSystemProperty("spark.driver.memory",settings['driver-memory'])
        conf = SparkConf()
        conf = conf.setMaster(settings['master'])
        conf = conf.set('spark.driver.maxResultSize',settings['driver-memory'])
        _sc = SparkContext.getOrCreate(conf=conf)
        _sc.setLogLevel("WARN")
    return _sc
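
The helper above assumes a GlobalPreferences mapping defined elsewhere in the project; a hypothetical stand-in covering just the keys it reads might look like:

# Hypothetical stand-in for GlobalPreferences; keys match those read above, values are placeholders
GlobalPreferences = {
    'spark_configuration': {
        'master': 'local[*]',
        'executor-memory': '4g',
        'driver-memory': '4g',
    }
}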
        
Example #4
"""
This file is designed to be launched as a PYTHONSTARTUP script.
"""

import atexit
import os
import platform

import py4j

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.storagelevel import StorageLevel

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri",
                                   os.environ["SPARK_EXECUTOR_URI"])

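# _ensure_initialized() starts the Py4J gateway so that SparkContext._jvm (used below) is available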
SparkContext._ensure_initialized()

try:
    # Try to access HiveConf; it will raise an exception if Hive is not on the classpath
    SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
    spark = SparkSession.builder\
        .enableHiveSupport()\
        .getOrCreate()
except py4j.protocol.Py4JError:
    spark = SparkSession.builder.getOrCreate()
except TypeError:
    spark = SparkSession.builder.getOrCreate()

sc = spark.sparkContext
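
Once the interpreter is started with PYTHONSTARTUP pointing at this file, spark and sc are already bound; a minimal sanity check for that session (a sketch) could be:

# Hypothetical interactive check (spark and sc are created above)
print("Spark version:", sc.version)
spark.range(5).show()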
Example #5
      # try again if port unavailable
      if check == notfound:
         port += 1

  # return the first available port
  return port


# ADD_FILES is the deprecated equivalent of ADD_JARS
add_files = None
if os.environ.get("ADD_FILES") is not None:
    add_files = os.environ.get("ADD_FILES").split(',')

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])

# set up a Mesos-based connection
conf = (SparkConf()
         .setMaster(os.environ["SPARK_MASTER"]))

# set the UI port
conf.set("spark.ui.port", ui_get_available_port())

# configure docker containers as executors
conf.setSparkHome(os.environ.get("SPARK_HOME"))
conf.set("spark.mesos.executor.docker.image", "lab41/spark-mesos-dockerworker-ipython")
conf.set("spark.mesos.executor.home", "/usr/local/spark-1.4.1-bin-hadoop2.4")
conf.set("spark.executorEnv.MESOS_NATIVE_LIBRARY", "/usr/local/lib/libmesos.so")
conf.set("spark.network.timeout", "100")
Example #6
from datetime import datetime as dt

from py4j.protocol import Py4JJavaError

from pyspark import SparkConf, SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, StopWordsRemover, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import broadcast, col, monotonically_increasing_id, regexp_replace, udf
from pyspark.sql.types import DateType, IntegerType, StringType, StructField, StructType

# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, CONSUMER_COMPLAINTS, AMERICAN_STATES,
# cleanse_field and is_not_blank are defined elsewhere in the original project.


def main():
    # Instantiate SparkConf and set extraJavaOptions for both executors and drivers
    spark_conf = (SparkConf().set(
        'spark.executor.extraJavaOptions',
        '-Dcom.amazonaws.services.s3.enableV4=true').set(
            'spark.driver.extraJavaOptions',
            '-Dcom.amazonaws.services.s3.enableV4=true'))

    # Instantiate SparkContext based on SparkConf
    sc = SparkContext(conf=spark_conf)

    # Set enableV4 property to access S3 input data
    sc.setSystemProperty('com.amazonaws.services.s3.enableV4', 'true')

    # Get the Hadoop configuration from the underlying Java SparkContext
    hadoopConf = sc._jsc.hadoopConfiguration()

    # Set Hadoop configuration K-V
    if is_not_blank(AWS_ACCESS_KEY_ID):
        hadoopConf.set('fs.s3a.awsAccessKeyId', AWS_ACCESS_KEY_ID)
    if is_not_blank(AWS_SECRET_ACCESS_KEY):
        hadoopConf.set('fs.s3a.awsSecretAccessKey', AWS_SECRET_ACCESS_KEY)
    hadoopConf.set('com.amazonaws.services.s3a.enableV4', 'true')
    hadoopConf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')

    # Create a SparkSession on top of the existing SparkContext (getOrCreate reuses it)
    spark_session = (
        SparkSession(sc).builder.appName('ComplaintClassificator').config(
            conf=spark_conf).getOrCreate())

    # Timestamp of start
    start_timestamp = dt.now()

    # Instantiate SparkContext
    sc = spark_session.sparkContext

    # Instantiate SQLContext
    sql_ctx = SQLContext(sc)

    # Set log level to 'WARN'
    sc.setLogLevel('WARN')

    # Set up log4j logging
    log4j_logger = sc._jvm.org.apache.log4j
    logger = log4j_logger.LogManager.getLogger(__name__)

    # Create schema as a StructType of StructField(s)
    schema = StructType([
        StructField('ReceivedDate', StringType(), True),
        StructField('Product', StringType(), True),
        StructField('Subproduct', StringType(), True),
        StructField('Issue', StringType(), True),
        StructField('Subissue', StringType(), True),
        StructField('ConsumerComplaintNarrative', StringType(), True),
        StructField('CompanyPublicResponse', StringType(), True),
        StructField('CompanyName', StringType(), True),
        StructField('State', StringType(), True),
        StructField('ZipCode', IntegerType(), True),
        StructField('Tags', StringType(), True),
        StructField('IsConsumerConsent', StringType(), True),
        StructField('SubmittedVia', StringType(), True),
        StructField('SentDate', StringType(), True),
        StructField('CompanyResponseToConsument', StringType(), True),
        StructField('IsTimelyResponse', StringType(), True),
        StructField('IsConsumerDisputed', StringType(), True),
        StructField('ComplaintId', IntegerType(), True)
    ])

    logger.warn("Starting preprocessing and data cleansing...")

    # Read Consumer_Complaints.csv file and apply schema
    complaint_df = (spark_session.read.format('csv').option(
        'header',
        'true').option('delimiter', ',').option('mode', 'FAILFAST').option(
            'parserLib', 'univocity').option('escape', '"').option(
                'multiLine', 'true').option('inferSchema', 'false').schema(
                    schema).load(CONSUMER_COMPLAINTS).alias('complaint_df'))

    logger.warn("Explaining complaint_df...")
    complaint_df.explain()

    logger.warn("complaint_df has %d records, %d columns." %
                (complaint_df.count(), len(complaint_df.columns)))
    logger.warn("Printing schema of complaint_df: ")
    complaint_df.printSchema()

    # Register the cleanse_field function as a UDF (UserDefinedFunction)
    udf_cleansed_field = udf(cleanse_field, StringType())

    # UDF that parses the 'MM/DD/YYYY' date strings into DateType values
    change_data_format = udf(lambda x: dt.strptime(x, '%m/%d/%Y'), DateType())

    # Do some clean-up activities
    cleansed_df = (complaint_df.withColumn(
        'Issue', udf_cleansed_field(
            complaint_df['ConsumerComplaintNarrative'])).withColumn(
                'ReceivedDate',
                change_data_format(complaint_df['ReceivedDate'])))

    logger.warn("Explaining cleansed_df...")
    cleansed_df.explain()

    logger.warn("cleansed_init_df has %d records, %d columns." %
                (cleansed_df.count(), len(cleansed_df.columns)))
    logger.warn("Printing schema of cleansed_df: ")
    cleansed_df.printSchema()

    # Keep only the needed fields and drop rows with a null consumer complaint narrative
    final_complaints_df = (cleansed_df.where(
        cleansed_df['ConsumerComplaintNarrative'].isNotNull()).select(
            'ComplaintId', 'ReceivedDate', 'State', 'Product',
            'ConsumerComplaintNarrative',
            'Issue').orderBy(cleansed_df['ReceivedDate']))

    final_complaints_df.registerTempTable("final_complaints_df")

    # Check random ConsumerComplaintNarrative as well as Issue content
    sql_ctx.sql(""" SELECT RowNum, ConsumerComplaintNarrative, Issue FROM
                    (SELECT ROW_NUMBER() OVER (PARTITION BY State ORDER BY ReceivedDate DESC) AS RowNum,
                        ConsumerComplaintNarrative,
                        Issue,
                        ReceivedDate,
                        State
                    FROM final_complaints_df) fc
                    WHERE RowNum = 1
                    LIMIT 10
                    """).show()

    logger.warn("Explaining final_complaints_df...")
    final_complaints_df.explain()

    logger.warn(
        "final_complaints has %d records, %d columns." %
        (final_complaints_df.count(), len(final_complaints_df.columns)))
    logger.warn("Printing schema of final_complaints_df: ")
    final_complaints_df.printSchema()

    # Read the American states JSON file into the states_df DataFrame
    states_df = (spark_session.read.json(AMERICAN_STATES,
                                         multiLine=True).alias('states_df'))

    logger.warn("Explaining states_df...")
    states_df.explain()

    logger.warn("states_df has %d records, %d columns." %
                (states_df.count(), len(states_df.columns)))
    logger.warn("Printing schema of states_df: ")
    states_df.printSchema()

    # List of fields to drop (not needed for further processing)
    drop_list = ['state', 'abbreviation']

    # Join complaints data with American states, apply id field and drop unnecessary fields
    joined_df = (final_complaints_df.join(
        broadcast(states_df),
        col('complaint_df.State') == col('states_df.abbreviation'),
        "left").withColumnRenamed('ConsumerComplaintNarrative',
                                  'ConsumerComplaint').withColumn(
                                      'RowNoIndex',
                                      monotonically_increasing_id()).select(
                                          'Product', 'ConsumerComplaint',
                                          'name').drop(*drop_list))

    joined_df.registerTempTable("joined_df")

    # Check random FullStateName content
    sql_ctx.sql(
        """ SELECT RowNum, Product, ConsumerComplaint, FullStateName FROM
                        (SELECT ROW_NUMBER() OVER (PARTITION BY Product ORDER BY ConsumerComplaint DESC) AS RowNum,
                            Product,
                            ConsumerComplaint,
                            name AS FullStateName
                        FROM joined_df) jd
                        WHERE RowNum = 1
                        LIMIT 10
                        """).show()

    logger.warn("Explaining joined_df...")
    joined_df.explain()

    logger.warn("joined_df has %d records, %d columns." %
                (joined_df.count(), len(joined_df.columns)))
    logger.warn("Printing schema of joined_df: ")
    joined_df.printSchema()

    # Check unique labels of Product attribute before replace
    joined_df.select('Product').distinct().show()

    # Replace redundant labels from Product field
    renamed_df = (joined_df.withColumn(
        'Product',
        regexp_replace(
            'Product',
            "Credit reporting, credit repair services, or other personal consumer reports",
            "Credit reporting, repair, or other")
    ).withColumn(
        'Product',
        regexp_replace("Product", "Virtual currency",
                       "Money transfer, virtual currency, or money service")
    ).withColumn(
        'Product',
        regexp_replace(
            "Product", "Money transfer",
            "Money transfer, virtual currency, or money service")).withColumn(
                'Product',
                regexp_replace(
                    "Product", "Payday loan",
                    "Payday loan, title loan, or personal loan")).withColumn(
                        'Product',
                        regexp_replace(
                            "Product", "Credit reporting",
                            "Credit reporting, repair, or other")).withColumn(
                                'Product',
                                regexp_replace(
                                    "Product", "Prepaid card",
                                    "Credit card or prepaid card")).withColumn(
                                        'Product',
                                        regexp_replace(
                                            "Product", "Credit card",
                                            "Credit card or prepaid card")))

    renamed_df.registerTempTable("renamed_df")

    # Check how many unique labels (classes) there are
    sql_ctx.sql(""" SELECT DISTINCT Product FROM renamed_df """).show()

    # Check how many times each class occurs in the corpus
    sql_ctx.sql(""" SELECT Product, count(*) 
    FROM renamed_df GROUP BY Product 
    ORDER BY count(*) DESC""").show(50, False)

    logger.warn("Explaining renamed_df...")
    renamed_df.explain()

    # Check unique labels of Product attribute after replace
    renamed_df.select('Product').distinct().show()

    # Check the number of unique labels of the Product attribute after the replacements
    logger.warn(str(renamed_df.select('Product').distinct().count()))

    logger.warn("Starting feature extraction...")

    # Tokenize consumer complaints sentences
    tokenizer = Tokenizer(inputCol='ConsumerComplaint', outputCol='Words')

    # Remove stop words
    remover = StopWordsRemover(inputCol='Words', outputCol='FilteredWords')

    # num_features = 700
    hashing_tf = HashingTF(inputCol='FilteredWords', outputCol='RawFeatures')

    # minDocFreq (minimum number of documents a term must appear in) is left at its default here and tuned later via the parameter grid
    idf = IDF(inputCol='RawFeatures', outputCol='features')

    # Instantiate StringIndexer
    product_indexer = StringIndexer(inputCol='Product', outputCol='label')

    # Create a pipeline from previously defined feature extraction stages
    pipeline = Pipeline(
        stages=[tokenizer, remover, hashing_tf, idf, product_indexer])

    # Fit renamed_df to the pipeline
    pipeline_fit = pipeline.fit(renamed_df)

    # Transform pipeline_fit
    data_set = pipeline_fit.transform(renamed_df)

    # Randomly slice the data into training and test datasets with requested ratio
    (training_data, test_data) = data_set.randomSplit([0.7, 0.3], seed=100)

    # Cache training_data
    training_data.cache()

    logger.warn("Starting Naive-Bayes...")

    # Naive-Bayes
    nb = NaiveBayes(labelCol='label',
                    featuresCol='features',
                    modelType='multinomial')

    # Create a model without Cross Validation
    nb_model = nb.fit(training_data)

    # Make predictions on model without Cross Validation
    predictions = nb_model.transform(test_data)

    print("NaiveBayes without CV model type: ", nb.getModelType())
    print("NaiveBayes without CV smoothing factor: ", str(nb.getSmoothing()))

    # NB without CV metrics
    nb_metrics_rdd = MulticlassMetrics(predictions['label', 'prediction'].rdd)

    # NB stats by each class (label)
    labels = predictions.rdd.map(lambda cols: cols.label).distinct().collect()

    logger.warn("Printing NB stats...")

    for label in sorted(labels):
        try:
            print("Class %s precision = %s" %
                  (label, nb_metrics_rdd.precision(label)))
            print("Class %s recall = %s" %
                  (label, nb_metrics_rdd.recall(label)))
            print("Class %s F1 Measure = %s" %
                  (label, nb_metrics_rdd.fMeasure(label, beta=1.0)))
        except Py4JJavaError:
            pass

    # Weighted stats
    print("Weighted recall = %s" % nb_metrics_rdd.weightedRecall)
    print("Weighted precision = %s" % nb_metrics_rdd.weightedPrecision)
    print("Weighted F(1) Score = %s" % nb_metrics_rdd.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" %
          nb_metrics_rdd.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" %
          nb_metrics_rdd.weightedFalsePositiveRate)

    # Show 10 predictions that were classified incorrectly
    predictions.filter(predictions['prediction'] != predictions['label']) \
        .select("Product", "ConsumerComplaint", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=20)

    # Show 10 predictions that were classified correctly
    predictions.filter(predictions['prediction'] == predictions['label']) \
        .select("Product", "ConsumerComplaint", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=20)

    # Instantiate an evaluator for the predictions made without Cross Validation
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction")

    # Evaluate the model trained without Cross Validation
    accuracy_without_cv = evaluator.evaluate(predictions)

    print("Naive-Bayes accuracy without Cross Validation = %s (metric)" %
          str(nb_metrics_rdd.accuracy))

    logger.warn("Starting Cross Validation...")

    # Instantiate ParamGridBuilder for the Cross Validation purpose
    nbp_params_grid = (ParamGridBuilder().addGrid(
        nb.smoothing,
        [0.8, 0.9, 1.0]).addGrid(hashing_tf.numFeatures,
                                 [700, 720]).addGrid(idf.minDocFreq,
                                                     [3, 4, 5]).build())

    # Instantiate the Evaluator of the model
    nb_evaluator = MulticlassClassificationEvaluator(
        labelCol='label', predictionCol='prediction')

    # Instantiate 5-fold CrossValidator
    nb_cv = CrossValidator(estimator=nb,
                           estimatorParamMaps=nbp_params_grid,
                           evaluator=nb_evaluator,
                           numFolds=5)

    # Create a model with Cross Validation
    nb_cv_model = nb_cv.fit(training_data)

    # Make predictions on model with Cross Validation
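    # Note: transform() is applied to training_data here rather than the held-out test_data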
    cv_predictions = nb_cv_model.transform(training_data)

    # Evaluate the best model found with Cross Validation
    accuracy_with_cv = nb_evaluator.evaluate(cv_predictions)

    print("Naive-Bayes accuracy with Cross Validation:", str(accuracy_with_cv))

    print(
        "Improvement of the best fitted model (NB with CV) over plain NB: ",
        str(accuracy_with_cv - nb_metrics_rdd.accuracy))

    # NB with CV metrics
    nb_with_cv_metrics_rdd = MulticlassMetrics(
        cv_predictions['label', 'prediction'].rdd)

    # NB with CV stats by each class (label)
    labels = cv_predictions.rdd.map(lambda att: att.label).distinct().collect()

    logger.warn("Printing NB stats...")

    for label in sorted(labels):
        try:
            print("Class %s precision = %s" %
                  (label, nb_with_cv_metrics_rdd.precision(label)))
            print("Class %s recall = %s" %
                  (label, nb_with_cv_metrics_rdd.recall(label)))
            print("Class %s F1 Measure = %s" %
                  (label, nb_with_cv_metrics_rdd.fMeasure(label, beta=1.0)))
        except Py4JJavaError:
            pass

    # Print weighted stats
    print("Weighted recall = %s" % nb_with_cv_metrics_rdd.weightedRecall)
    print("Weighted precision = %s" % nb_with_cv_metrics_rdd.weightedPrecision)
    print("Weighted F(1) Score = %s" %
          nb_with_cv_metrics_rdd.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" %
          nb_with_cv_metrics_rdd.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" %
          nb_with_cv_metrics_rdd.weightedFalsePositiveRate)

    # Show 10 cross-validated predictions that were classified correctly
    (cv_predictions.filter(
        cv_predictions['prediction'] == cv_predictions['label']).select(
            'Product', 'ConsumerComplaint', 'probability', 'label',
            'prediction').orderBy('probability',
                                  ascending=False).show(n=10, truncate=20))

    # Show 10 cross-validated predictions that were classified incorrectly
    (cv_predictions.filter(
        cv_predictions['prediction'] != cv_predictions['label']).select(
            'Product', 'ConsumerComplaint', 'probability', 'label',
            'prediction').orderBy('probability',
                                  ascending=False).show(n=10, truncate=20))

    # Timestamp of end
    end_timestamp = dt.now()

    # Print elapsed time
    print("Elapsed time: %s" % str(end_timestamp - start_timestamp))

    # Stop SparkSession
    spark_session.stop()
Example #7
import configparser
import os

import findspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# 'files' (apparently a boto3 list_objects response) and 'outpatient_files' are defined earlier in the original script.
for i in files['Contents']:
    outpatient_files.append('s3n://cms-data-1/' + i['Key'])

dir = "Files/"

os.environ[
    'PYSPARK_SUBMIT_ARGS'] = "--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell"

config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/credentials"))
config['default']  # fail fast with a KeyError if the default AWS profile is missing
access_id = config.get('default', "aws_access_key_id")
access_key = config.get('default', "aws_secret_access_key")

findspark.init()
SparkContext.setSystemProperty('spark.driver.memory', '25G')
SparkContext.setSystemProperty('spark.executor.memory', '15G')
conf = SparkConf().setAppName('pyspark_model')
conf = (conf.setMaster('local[*]').set('spark.executor.memory', '15G').set(
    'spark.driver.memory', '25G').set('spark.driver.maxResultSize', '15G'))

sc = SparkContext(conf=conf)
spark = SparkSession(sc)

hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3n.impl",
                "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoop_conf.set("fs.s3n.awsAccessKeyId", access_id)
hadoop_conf.set("fs.s3n.awsSecretAccessKey", access_key)

Example #8
import os
from time import time

from pyspark import SparkContext
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession
"""
This script is about creating one vs rest model using Random forest classifier
"""

##########################################
## Author:  Kumar Awanish ###
##########################################
memory = '4g'
pyspark_submit_args = ' --driver-memory ' + memory + ' pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

SparkContext.setSystemProperty('spark.executor.memory', '4g')
sc = SparkContext("local", "App Name")
spark = SparkSession(sc)

datapath = '/Users/akumarmandapati/git/iosl/CSVs/Allcsv/final.csv'
datapath_DosGE = '/Users/akumarmandapati/git/Iosl/CSVs/Wednesday-workingHours.pcap_ISCX.csv'

# Load and parse the data file, converting it to a DataFrame.
try:
    print("Reading Dataset")
    dataset = spark.read.csv(datapath, header=True)
except Exception as err:
    print("Error parsing file, check the path:", err)

print("Input N for not including Benign data")
choice = input()
Example #9
    def start(self):
        epochTime = int(time.mktime(time.localtime()))
        randomNum = random.randint(0, 10000)
        self.__appName = "pyspark-" + str(epochTime) + "-" + str(randomNum)

        value = self.__args.get('spark.home')
        if not value:
            value = "/data/software/spark-2.3.1-bin-hadoop2.7"

        findspark.init(spark_home=value, python_path="python3")

        value = self.__args.get('spark.driver.memory')
        if not value:
            value = "3g"
        findspark.set_driver_mem(value)

        value = self.__args.get('spark.executor.memory')
        if not value:
            value = "3g"
        findspark.set_executor_mem(value)

        findspark.set_app_name(self.__appName)
        findspark.end()

        import pyspark
        from pyspark import SparkConf
        from pyspark.sql import SparkSession, SQLContext

        from pyspark.context import SparkContext
        if os.environ.get("SPARK_EXECUTOR_URI"):
            SparkContext.setSystemProperty("spark.executor.uri",
                                           os.environ["SPARK_EXECUTOR_URI"])

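        # _ensure_initialized() starts the Py4J gateway so that SparkContext._jvm (used below) is available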
        SparkContext._ensure_initialized()
        pySpark = None

        sc_conf = SparkConf()
        sc_conf.set('spark.locality.wait', 30000)
        sc_conf.set('spark.sql.autoBroadcastJoinThreshold', -1)
        sc_conf.set('spark.scheduler.minRegisteredResourcesRatio', 1)

        value = self.__args.get('spark.executor.cores')
        if not value:
            value = '1'
        sc_conf.set('spark.executor.cores', int(value))

        value = self.__args.get('spark.executor.instances')
        if not value:
            value = '1'
        sc_conf.set('spark.executor.instances', int(value))

        try:
            # Try to access HiveConf; it will raise an exception if Hive is not on the classpath
            SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
            spark = SparkSession.builder.enableHiveSupport().config(
                conf=sc_conf).getOrCreate()

        except py4j.protocol.Py4JError:
            spark = SparkSession.builder.config(conf=sc_conf).getOrCreate()
        except TypeError:
            spark = SparkSession.builder.config(conf=sc_conf).getOrCreate()

        sc = spark.sparkContext

        return spark, sc
Example #10
from datetime import datetime

from sklearn.metrics import mean_squared_error as mse
import math

# 'config' is a project-specific settings module defined elsewhere in the original project
# flag to confirm the writing of the forecasted value to the DB
real_flag = config.real_flag
total_t1 = datetime.now()
## Logging ##

import os
import sys

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

from pyspark.sql import SparkSession
#import pyspark
SparkContext.setSystemProperty('spark.executor.cores', '16')
full_t1 = datetime.now()
# initialise sparkContext

#conf1 = pyspark.SparkConf().setAll([('spark.executor.memory', '24g'), ('spark.executor.cores', 8), ('spark.cores.max', 8), ('spark.driver.memory','24g')])
#spark2 = SparkSession.builder.config(conf=conf1).getOrCreate()

spark1 = SparkSession.builder \
    .master(config.sp_master) \
    .appName(config.sp_appname) \
    .config('spark.executor.memory', config.sp_memory) \
    .config("spark.cores.max", config.sp_cores) \
    .config('spark.executor.cores',config.sp_cores) \
    .getOrCreate()

sc = spark1.sparkContext
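
The snippet above leans on a project-specific config module; a hypothetical stand-in exposing just the attributes referenced here might look like this (every value is a placeholder):

# config.py -- hypothetical stand-in; all values are placeholders
real_flag = False                # whether to write forecasted values to the DB
sp_master = "local[*]"           # Spark master URL
sp_appname = "forecast-job"      # application name
sp_memory = "4g"                 # spark.executor.memory
sp_cores = "4"                   # spark.cores.max / spark.executor.cores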