def train_partition(idx, iterator):
    """Launch a Barista training server for this partition and stream back one
    response per gradient-update request."""
    port = 50000 + idx % 256

    # Files shipped to the executors with SparkContext.addFile().
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset-%02d.hdf5" % idx)

    # Clear any stale readiness flag left over from a previous run.
    flag_file = "flags/__BARISTA_READY__.%d" % port
    if os.path.isfile(flag_file):
        os.remove(flag_file)

    # out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver,
                      "--dset-size", "30000",
                      "--initial-replay", "20000",
                      "--debug",
                      "--overwrite",
                      "--port", str(port)])

    # Busy-wait until the server creates its readiness flag.
    while not os.path.isfile(flag_file):
        pass

    for step in iterator:
        dc = DummyClient("127.0.0.1", port)
        dc.send(barista.GRAD_UPDATE)
        response = dc.recv()
        yield response
def spawn_barista(partition):
    """Spawn a single Barista training server and block until it reports ready."""
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset.hdf5")

    # Clear any stale readiness flag left over from a previous run.
    flag_file = "flags/__BARISTA_READY__"
    if os.path.isfile(flag_file):
        os.remove(flag_file)

    out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver],
                     stdout=out,
                     stderr=subprocess.STDOUT)

    # Busy-wait until the server creates its readiness flag.
    while not os.path.isfile(flag_file):
        pass
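# --- Usage sketch (not from the original source) ----------------------------------
# A minimal illustration of how train_partition() above could be wired into a job:
# the driver ships the model/solver files with addFile() so SparkFiles.get() can
# resolve them on the executors, then runs one Barista server per partition via
# mapPartitionsWithIndex(). The step count and partition count are assumptions.
if __name__ == "__main__":
    from pyspark import SparkContext

    sc = SparkContext(appName="barista-train")
    for name in ["main.py", "train_val.prototxt",
                 "deepq16.caffemodel", "solver.prototxt"]:
        sc.addFile(name)  # makes SparkFiles.get(name) work on every executor

    steps = sc.parallelize(range(1000), 4)   # one partition per Barista server
    responses = steps.mapPartitionsWithIndex(train_partition).collect()
    print("%d gradient updates applied" % len(responses))
    sc.stop()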
from pyspark import SparkContext, SparkConf, SparkFiles
from graph import Graph

NUM_PARTITION = 4
# NUM_CORE = 4

graph = Graph()

if __name__ == '__main__':
    sc = SparkContext(master="local[*]", appName="A*-Spark")
    sc.addFile("node.py")
    sc.addFile("nodeitem.py")
    sc.addFile("graph.py")
    SparkFiles.getRootDirectory()

    # read edges from the edge list file (raw string so the backslash is not escaped)
    transition_table_rdd = sc.textFile(r"C:\edge20.edgelist").map(
        lambda line: graph.createEdgeProperty(line),
        preservesPartitioning=True).partitionBy(NUM_PARTITION).cache()

    # create the state list RDD
    state_list_rdd = transition_table_rdd.groupByKey(
        numPartitions=NUM_PARTITION).mapValues(
        graph.getSucessorProperty).sortByKey().map(
        lambda val: graph.lineToNode(val)).cache()

    # share the boundary node IDs
    boundary_nodes = graph.getBoundaryNodesId(state_list_rdd, NUM_PARTITION)
    shared_nodes = sc.broadcast(boundary_nodes)

    # broadcast the target node (the last node after sorting by key); collect once
    target_node = state_list_rdd.collect()[-1]
    share_target_node = sc.broadcast(target_node)
def spark_env(app_name='my_spark_app', master='local[*]', jar_packages=[],
              files=[], spark_config={}):
    """Start Spark session, get Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster. Note that only the app_name argument will
    apply when this is called from a script sent to spark-submit. All other
    arguments exist solely for testing the script from within an interactive
    Python console.

    This function also looks for a file ending in 'config.json' that can be
    sent with the Spark job. If it is found, it is opened, the contents parsed
    (assuming it contains valid JSON for the ETL job configuration) into a
    dict of ETL job configuration parameters, which are returned as the last
    element in the tuple returned by this function. If the file cannot be
    found then the return tuple only contains the Spark session and Spark
    logger objects and None for config.

    The function checks the enclosing environment to see if it is being run
    from inside an interactive console session or from an environment which
    has a `DEBUG` environment variable set (e.g. setting `DEBUG=1` as an
    environment variable as part of a debug configuration within an IDE such
    as Visual Studio Code or PyCharm). In this scenario, the function uses all
    available function arguments to start a PySpark driver from the local
    PySpark package as opposed to using the spark-submit and Spark cluster
    defaults. This will also use local module imports, as opposed to those in
    the zip archive sent to spark via the --py-files flag in spark-submit.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # detect execution environment
    flag_repl = not (hasattr(__main__, '__file__'))
    flag_debug = 'DEBUG' in environ.keys()

    if not (flag_repl or flag_debug):
        # get Spark session factory
        spark_builder = (SparkSession.builder.appName(app_name))
        spark_jars_packages = ','.join(list(jar_packages))
        spark_builder.config('spark.jars.packages', spark_jars_packages)
    else:
        # get Spark session factory
        spark_builder = (SparkSession.builder.master(master).appName(app_name))

        # create Spark JAR packages string
        spark_jars_packages = ','.join(list(jar_packages))
        spark_builder.config('spark.jars.packages', spark_jars_packages)

        spark_files = ','.join(list(files))
        spark_builder.config('spark.files', spark_files)

        # add other config params
        for key, val in spark_config.items():
            spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('configs.json')
    ]
    print(config_files)

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        spark_logger.warn('no config file found')
        config_dict = None

    return spark_sess, spark_logger, config_dict
def __init__(self):
    """Create a Spark context and session."""
    print('__init__ called')
    config = configparser.ConfigParser()
    print('--------------------Start--------------------')
    sc = SparkContext.getOrCreate()
    print(sc.applicationId)
    print(sc.master)
    print('spark conf starts')
    spark = SparkSession(sc)
    for item in spark.sparkContext.getConf().getAll():
        print(item)
    print('spark conf ends')
    # cwd = os.getcwd()
    # arr = os.listdir(cwd)
    # print('--------------------working directories--------------------')
    # print(cwd)
    # print(arr)
    # print('--------------------cmd starts--------------------')
    ## cmd = 'hdfs dfs -ls /home/hadoop'.split()  # cmd must be an array of arguments
    ## files = subprocess.check_output(cmd).strip().split('\n')
    ## for path in files:
    ##     print(path)
    # print('--------------------applicationId--------------------')
    # appId = sc.applicationId
    # print(appId)
    # ip = 'ip-172-31-18-164.ec2.internal'
    # appPath = 'hdfs://' + ip + ':8020/user/hadoop/.sparkStaging/' + appId + '/config.json'
    # iniPath = 'hdfs://' + ip + ':8020/user/hadoop/.sparkStaging/' + appId + '/config.ini'
    # print(appPath)
    # print(iniPath)
    # tmp1 = 'file:///user/hadoop/.sparkStaging/' + appId + '/config.ini'
    # print(tmp1)
    ## print('--------------------config reader starts--------------------')
    ## strng = open(appPath, 'r').read()
    ## print(strng)
    # print('--------------------open ends--------------------')
    # print(SparkFiles.getRootDirectory())
    # print(os.listdir(SparkFiles.getRootDirectory()))
    ## cmd = 'hdfs dfs -ls ' + SparkFiles.getRootDirectory() + ''.split()
    ## files = subprocess.check_output('hdfs dfs -ls ' + SparkFiles.getRootDirectory()).strip().split('\n')
    ## for path in files:
    ##     print(path)
    ## stg_path = str(fs.defaultFS) + "/user/" + str(os.environ['USER']) + "/.sparkStaging/" + str(sc.applicationId) + "/"
    ## lines = sc.textFile(os.path.join(stg_path, 'readme.txt'))  # kept commented: stg_path above is commented out
    ## print(lines.collect())
    # print('--------------------getRootDirectory--------------------')
    ## configFile = pkg_resources.resource_filename(pkg_resources.Requirement.parse("myapp"), "config.ini")
    ## config = ConfigParser.ConfigParser()
    ## config.read(configFile)
    # print('--------------------INI file--------------------')
    # conString = ''
    # inputFile = 'config.ini'
    # with open(SparkFiles.get(inputFile)) as test_file:
    #     conString = test_file.read()
    # print('--------------------config reader ends--------------------')
    # print(conString)
    # print('print(conString) starts')
    # config.read_string(conString.decode())
    # print('print(conString) ends')
    # print(config)
    # val = config.get('SPARK', 'val')
    ## val = config['SPARK']['val']
    # print(val)
    print('--------------------val--------------------')
    print('--------------------JSON file--------------------')
    inputFile = 'config.json'
    # print(os.environ)
    # print(os.environ['SPARK_YARN_STAGING_DIR'])
    # print('os.environ completed')
    print(SparkFiles.getRootDirectory())
    ipath = os.path.join(SparkFiles.getRootDirectory() + '/' + inputFile)
    print(ipath)
    conString = ''
    print('--------------------printing a--------------------')
    a = sc.textFile("file:///" + SparkFiles.get(inputFile)).collect()
    print(a)
    print('--------------------printing b--------------------')
    b = sc.textFile("file:///" + ipath).collect()
    print(b)
    # with open(SparkFiles.get(inputFile)) as test_file:
    #     conString = test_file.read()
    print('--------------------config reader ends--------------------')
    print(conString)
    print('--------------------val--------------------')
    # hdfs://ip-172-31-19-25.ec2.internal:8020
    # print('--------------------Hadoop Files--------------------')
    ## print(SparkFiles.getRootDirectory())
    # print('--------------------Context--------------------')
    ## textFile = sc.textFile(appPath)
    # conf = spark.read.option("multiline", True).json(appPath)
    # print('--------------------Spark Read--------------------')
    # print(conf.printSchema())
    # print(conf.select('SPARK1').first()[0])
    # print(sc.sparkUser())
    # print('--------------------completed--------------------')
    # data = sc.parallelize(list(conf['SPARK']))
    data = sc.parallelize(list('HelloWorld12345'))
    data.map(lambda x: (x, 1)).reduceByKey(add).sortBy(
        lambda x: x[1], ascending=False).coalesce(1).saveAsTextFile(
        'tmp/result/' + str(ts))  # s3://nithin-emr/' + str(ts) + '/result'
    sc.stop()
def start_spark(app_name='my_spark_etl', master='local[*]', jar_packages=[],
                files=[], spark_config={}):
    """Start Spark session, get Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster. Note that only the app_name argument will
    apply when this is called from a script sent to spark-submit. All other
    arguments exist solely for testing the script from within an interactive
    Python console.

    This function also looks for a file ending in 'config.json' that can be
    sent with the Spark job. If it is found, it is opened, the contents
    parsed (assuming it contains valid JSON for the ETL job configuration),
    into a dict of ETL job configuration parameters, which are returned as
    the last element in the tuple returned by this function. If the file
    cannot be found then the return tuple only contains the Spark session
    and Spark logger objects and None for config.

    The function checks the enclosing environment to see if it is being run
    from inside an interactive console session or from an environment which
    has a `DEBUG` environment variable set (e.g. setting `DEBUG=1` as an
    environment variable as part of a debug configuration within an IDE such
    as Visual Studio Code or PyCharm). In this scenario, the function uses
    all available function arguments to start a PySpark driver from the local
    PySpark package as opposed to using the spark-submit and Spark cluster
    defaults. This will also use local module imports, as opposed to those in
    the zip archive sent to spark via the --py-files flag in spark-submit.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # detect execution environment
    flag_repl = not hasattr(__main__, '__file__')
    flag_debug = 'DEBUG' in environ.keys()

    warehouse_location = abspath('spark-warehouse')

    if not (flag_repl or flag_debug):
        # if False:
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .appName(app_name)
            .config('spark.jars', 'configs/spark-avro_2.11-2.4.4.jar')
        )
        # Flag required for suppressing Hive table creation
        # (create_database_table()) and Impala REFRESH
        environment = 'local'
    else:
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .master(master)
            .appName(app_name)
            .config("spark.sql.warehouse.dir", warehouse_location)
            .config('spark.jars', 'configs/spark-avro_2.11-2.4.4.jar')
            .enableHiveSupport()
        )
        # assumed label for this branch; leaving `environment` unbound here would
        # break the warn/return at the end of the function
        environment = 'debug'

        # if inputfile.startswith('gs://') or outputfile.startswith('gs://'):
        spark_builder.config('fs.gs.impl',
                             'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
        spark_builder.config('fs.AbstractFileSystem.gs.impl',
                             'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS')
        # spark_builder.config('spark.jars', '/tool/spark-2.4.5-bin-hadoop2.7/jars/avro-1.8.2.jar')
        spark_builder.config('spark.jars', '../configs/spark-avro_2.11-2.4.4.jar')

        # create Spark JAR packages string
        spark_jars_packages = ','.join(list(jar_packages))
        spark_builder.config('spark.jars.packages', spark_jars_packages)

        spark_files = ','.join(list(files))
        spark_builder.config('spark.files', spark_files)

        # add other config params
        for key, val in spark_config.items():
            spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]

    if len(config_files) != 0:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_json = config_file.read().replace('\n', '')
        config_dict = loads(config_json)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        try:
            with open('configs/etl_config.json', 'r') as config_file:
                config_json = config_file.read().replace('\n', '')
            config_dict = loads(config_json)
        except FileNotFoundError:
            config_dict = None

    spark_logger.warn('environment: ' + environment)

    return spark_sess, spark_logger, config_dict, environment
def start_spark(app_name='flatten_json_spark', master='local[*]',
                jar_packages=None, files=None, spark_config=None):
    """Start Spark session, get the Spark logger and load config files.

    :param app_name: Name of the Spark app.
    :param master: Cluster connection details.
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    if spark_config is None:
        spark_config = {}
    if jar_packages is None:
        jar_packages = []
    if files is None:
        files = []

    if __name__ == '__main__':
        # get Spark session factory
        spark_builder = (SparkSession.builder.appName(app_name))
    else:
        # get Spark session factory
        spark_builder = (SparkSession.builder.appName(app_name).master(master))

    # create Spark JAR packages string
    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    # create file list to send to master and workers
    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add config parameters, if any
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark = spark_builder.getOrCreate()
    logger = Log4j(spark)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('config.json')
    ]

    if len(config_files) != 0:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_json = config_file.read().replace('\n', '')
        config_dict = loads(config_json)
        logger.warn('loaded config from ' + config_files[0])
    else:
        config_dict = None

    # build return tuple conditional on presence of config
    if config_dict is not None:
        return_tup = spark, logger, config_dict
    else:
        return_tup = spark, logger
    return return_tup
def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[],
                files=[], spark_config={}):
    """Start Spark session, get the Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster. NOTE - only the app_name argument will
    apply when this is called from a script sent to spark-submit (i.e. when
    __name__ = '__main__'). All other arguments exist solely for testing the
    script from within an interactive Python console.

    This function also looks for a file ending in 'config.json' that can be
    sent with the Spark job. If it is found, it is opened, the contents
    parsed (assuming it contains valid JSON for the ETL job configuration),
    into a dict of ETL job configuration parameters, which are returned as
    the last element in the tuple returned by this function. If the file
    cannot be found then the return tuple only contains the Spark session
    and Spark logger objects.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    if __name__ == '__main__':
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .appName(app_name))
    else:
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .master(master)
            .appName(app_name))

        # create Spark JAR packages string
        spark_jars_packages = ','.join(list(jar_packages))
        spark_builder.config('spark.jars.packages', spark_jars_packages)

        spark_files = ','.join(list(files))
        spark_builder.config('spark.files', spark_files)

        # add other config params
        for key, val in spark_config.items():
            spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]

    if len(config_files) != 0:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_json = config_file.read().replace('\n', '')
        config_dict = loads(config_json)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        config_dict = None

    # build return tuple conditional on presence of config
    if config_dict is not None:
        return_tup = spark_sess, spark_logger, config_dict
    else:
        return_tup = spark_sess, spark_logger
    return return_tup
def main(): # start connection # configure spark instance to default global s_context global Logger global mylogger global s_context config = SparkConf() config.setAppName("Gait-Realtime-Analysis") s_context = SparkContext(conf=config) s_context.setLogLevel("ERROR") sys.path.insert(0, SparkFiles.getRootDirectory()) s_context.addFile('./model/cnn_modell.h5') s_context.addFile("./data_transformation.py") # TODO: add logger to spark # use spark context to create the stream context # 5 seconds ensure that we get two overlapping samples of 4 seconds interval_seconds = 10 s_stream_context = pss.StreamingContext(s_context, interval_seconds) s_stream_context.checkpoint("checkpoint_TSA") # with tf.gfile.GFile('./frozenInferenceGraphIdentification.pb', "rb") as f: # model_data = f.read() # model_data_bc = s_context.broadcast(model_data) # model_data_bc = s_context.broadcast(loaded_model) # connect to port 9009 i.e. twitter-client print(API_SERVICE_URL + ' ' + SPARK_SOCKET_PORT) socket_ts = s_stream_context.socketTextStream(API_SERVICE_URL, int(SPARK_SOCKET_PORT)) print("\n################################\n") line = socket_ts.flatMap(lambda line: line.split("\n")) gait = line.map(lambda g: (getUserId(g).strip(), g.strip())) gaitByUserId = gait.groupByKey() sortedGaitByUserId = gaitByUserId.transform( lambda foo: foo.sortBy(lambda x: (x[0]))) # sortedGaitByUserId = gaitByUserId.sortByKey() # author_counts_sorted_dstream = author_counts.transform(\ # (lambda foo:foo\ # .sortBy(lambda x:( -x[1])) ) # ) # author_counts_sorted_dstream.pprint() # sortedGaitByUserId.foreachRDD(another) segmentedData = sortedGaitByUserId.mapPartitions(partition_mapper_func) # x = cogrouped.mapValues(iterate) # for e in x.collect(): # print (e) # segmentedData.pprint() # DO NOT CHANGE THE LOCATION OF THIS FUNCTION def infer(data_rdd): # print("ATTEMPTING DEEP LEARNING") try: datas = data_rdd.collect() if len(datas) > 0: # print("INSIDE TRY BEFORE WITH") # with tf.Graph().as_default() as graph: # graph_def = tf.GraphDef() # graph_def.ParseFromString(model_data_bc.value) # tf.import_graph_def(graph_def, name="prefix") # print("INSIDE TRY AFTER WITH") # x = graph.get_tensor_by_name('prefix/Placeholder:0') # y = graph.get_tensor_by_name('prefix/Softmax:0') for data in datas: for id_xyz in data: if id_xyz: id = id_xyz[0] dummy_axis = "0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 
0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00" input_signals = [] input_signals.extend(id_xyz[1:]) for i in range(3): input_signals.append(dummy_axis) X_signals = [] for each in input_signals: X_signals.append([ np.array(cell, dtype=np.float32) for cell in [each.strip().split(' ')] ]) X_test = np.transpose(np.array(X_signals), (1, 2, 0)) from pyspark import SparkFiles from tensorflow.keras.models import load_model path = SparkFiles.get('cnn_modell.h5') model = load_model(path) print("Loaded model from disk") preds = model.predict(X_test) for p in preds: inferred_user_id = str(np.argmax(p) + 1) results = { 'confidency': str(np.amax(p)), 'inferred_user_id': inferred_user_id, 'actual_user_id': str(id) } print(results) requests.post(back_end_url, json=results) # with tf.Session(graph=graph) as sess: # y_out = sess.run(y, feed_dict={ # x: X_test # }) # for each in y_out: # inferred_user_id = str(np.argmax(each) + 1) # confidency = str(np.amax(each)) # actual_user_id = str(id) # results = {'confidency': confidency, 'inferred_user_id': inferred_user_id, # 'actual_user_id': actual_user_id} # print(results) # requests.post(back_end_url, json=results) except: e = sys.exc_info() print("Error: %s" % e) print('infer:', 'running inference on segmented data') segmentedData.foreachRDD(infer) # start the streaming computation s_stream_context.start() try: # wait for the streaming to finish s_stream_context.awaitTermination() except KeyboardInterrupt: print("\nSpark shutting down\n")
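# --- Usage note (not from the original source) ------------------------------------
# The streaming job above relies on s_context.addFile('./model/cnn_modell.h5') on
# the driver: every executor can then resolve the same file locally when it runs
# the inference step, e.g.
#
#   from pyspark import SparkFiles
#   from tensorflow.keras.models import load_model
#   model = load_model(SparkFiles.get('cnn_modell.h5'))
#
# so the Keras model does not have to be baked into worker images or broadcast
# manually alongside the data.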
from pyspark import SparkConf, SparkFiles
from pyspark.sql import SparkSession
import time
import os
import sys

conf = SparkConf().setAppName("test")
spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()

# Walk the work dir and the current dir, printing (path, subdirs, files) per entry.
print("============")
for p, s, d in os.walk("/opt/spark/work-dir/"):
    print(f"{p}, {s}, {d}")
print("============")

currentDirectory = os.getcwd()
print(currentDirectory)
for p, s, d in os.walk(currentDirectory):
    print(f"{p}, {s}, {d}")

# Move into the SparkFiles root so files shipped with the job can be read by
# relative name, and make lib.zip importable.
os.chdir(SparkFiles.getRootDirectory())
sys.path.append("lib.zip")

import test_spark.unit_a as unit_a

unit_a.hello()

with open("data.txt", 'r') as fh:
    print(fh.read())

# time.sleep(300)
spark.stop()
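# --- Usage note (not from the original source) ------------------------------------
# For the script above to find lib.zip and data.txt under SparkFiles.getRootDirectory(),
# they have to be shipped with the job; the submit command below is illustrative:
#
#   spark-submit --files data.txt,lib.zip test_job.py
#
# or, equivalently, set them on the SparkConf before creating the session:
#
#   conf = SparkConf().setAppName("test").set("spark.files", "data.txt,lib.zip")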
def getSparkSession(self, files=[], jars=[], spark_config={}):
    """
    Setting Spark config in code takes precedence over setting it in spark-submit:
    https://spark.apache.org/docs/latest/submitting-applications.html#loading-configuration-from-a-file

    :param app_name:
    :param master:
    :param jars:
    :param files:
    :param spark_config:
    :return:
    """
    # collect jars bundled with the project plus any passed in by the caller
    src_jars_path = os.path.join(self.root_dir, 'jars')
    src_jars = [
        os.path.join(src_jars_path, f) for f in os.listdir(src_jars_path)
    ]
    jars = ','.join(src_jars + jars)

    # collect files bundled with the project
    src_files_path = os.path.join(self.root_dir, 'files')
    src_files = [
        os.path.join(src_files_path, f) for f in os.listdir(src_files_path)
    ]
    files = ','.join(src_files)

    spark_builder = (
        SparkSession.builder
        # .master(master)
        # .appName(app_name)
        # .config('spark.executor.memory', '1g')
        # .config("spark.driver.memory", "1g")
        # .config("spark.memory.offHeap.size", "1g")
        .config("spark.jars", jars)
        .config('spark.files', files)
        # .config("spark.hadoop.fs.s3a.endpoint", 'http://xxxx')
        # .config("spark.hadoop.fs.s3a.access.key", os.environ.get('MINIO_ACCESS_KEY_ID'))
        # .config("spark.hadoop.fs.s3a.secret.key", os.environ.get('MINIO_SECRET_ACCESS_KEY'))
        .config("spark.local.dir", self.temp_dir))

    for key, val in spark_config.items():
        spark_builder.config(key, val)

    spark = spark_builder.getOrCreate()

    # read the first *config.json shipped via spark.files and copy its entries
    # into the runtime Spark conf
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        os.path.join(spark_files_dir, filename)
        for filename in os.listdir(spark_files_dir)
        if filename.endswith('config.json')
    ][0]
    with open(config_files, 'r') as f:
        config_map = json.load(f)
        for key, val in config_map.items():
            spark.conf.set(key, val)

    print(spark.sparkContext.getConf().getAll())
    print(spark.conf.get("spark.cassandra.connection.host"))
    print(spark.conf.get("spark.hadoop.fs.s3a.access.key"))
    print(spark.conf)
    # spark._jsc.had
    return spark
def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[],
                files=[], spark_config={}):
    """Start Spark session, get Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster. This function also looks for a file ending
    in 'config.json' that can be sent with the Spark job.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # detect execution environment
    flag_repl = not(hasattr(__main__, '__file__'))
    flag_debug = 'DEBUG' in environ.keys()

    if not (flag_repl or flag_debug):
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .appName(app_name))
    else:
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .master(master)
            .appName(app_name))

        # create Spark JAR packages string
        spark_jars_packages = ','.join(list(jar_packages))
        spark_builder.config('spark.jars.packages', spark_jars_packages)

        spark_files = ','.join(list(files))
        spark_builder.config('spark.files', spark_files)

        # add other config params
        for key, val in spark_config.items():
            spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging_utils.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        spark_logger.warn('no config file found')
        config_dict = None

    return spark_sess, spark_logger, config_dict
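# --- Usage sketch (not from the original source) ----------------------------------
# How the start_spark() variants above are typically invoked. The file names and the
# job body below are assumptions for illustration; the key point is that anything
# passed via --files (or the `files` argument in debug/REPL mode) lands in
# SparkFiles.getRootDirectory(), which is exactly where the config lookup searches.
#
#   spark-submit --master yarn \
#       --py-files packages.zip \
#       --files configs/etl_config.json \
#       jobs/etl_job.py
if __name__ == '__main__':
    spark, log, config = start_spark(
        app_name='my_etl_job',
        files=['configs/etl_config.json'])
    log.warn('config loaded: ' + str(config is not None))
    spark.stop()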
def _get_or_create_tmp_dir():
    root_dir = SparkFiles.getRootDirectory()
    xgb_tmp_dir = os.path.join(root_dir, "xgboost-tmp")
    if not os.path.exists(xgb_tmp_dir):
        os.makedirs(xgb_tmp_dir)
    return xgb_tmp_dir
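# --- Usage sketch (not from the original source) ----------------------------------
# _get_or_create_tmp_dir() is intended to run on executors, where
# SparkFiles.getRootDirectory() points at the per-application scratch directory,
# so anything written there is cleaned up when the application ends. The helper
# and file-name scheme below are assumptions for illustration.
def _write_partition_scratch(idx, iterator):
    """Spill one partition to the executor-local xgboost-tmp directory."""
    scratch = os.path.join(_get_or_create_tmp_dir(), "part-%05d.txt" % idx)
    with open(scratch, "w") as fh:
        for row in iterator:
            fh.write(str(row) + "\n")
    yield scratch

# e.g. sc.parallelize(range(100), 4).mapPartitionsWithIndex(_write_partition_scratch).collect()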
def start_spark(app_name='hello_fresh_etl_job', master='local[*]',
                jar_packages=[], files=[], spark_config={}):
    """Start Spark session, get Spark logger and load config files.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # detect execution environment
    flag_repl = not hasattr(__main__, '__file__')
    flag_debug = 'DEBUG' in environ.keys()

    if not (flag_repl or flag_debug):
        # get Spark session factory
        spark_builder = (SparkSession.builder.appName(app_name))
        environment = 'local'
    else:
        # get Spark session factory
        spark_builder = (SparkSession.builder.master(master).appName(app_name))
        # assumed label for this branch; leaving `environment` unbound here would
        # break the return at the end of the function
        environment = 'debug'

        # create Spark JAR packages string
        spark_jars_packages = ','.join(list(jar_packages))
        spark_builder.config('spark.jars.packages', spark_jars_packages)

        spark_files = ','.join(list(files))
        spark_builder.config('spark.files', spark_files)

        # add other config params
        for key, val in spark_config.items():
            spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('config.json')
    ]

    if len(config_files) != 0:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_json = config_file.read().replace('\n', '')
        config_dict = loads(config_json)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        try:
            with open('configs/etl_config.json', 'r') as config_file:
                config_json = config_file.read().replace('\n', '')
            config_dict = loads(config_json)
        except FileNotFoundError:
            config_dict = None

    return spark_sess, spark_logger, config_dict, environment
def __ls(broadcast_vars, iterator):
    """Get the list of files in the worker-local directory."""
    return [__get_hostname(), os.listdir(SparkFiles.getRootDirectory())]
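# --- Usage sketch (not from the original source) ----------------------------------
# One way to run __ls() above on every partition and see which files addFile()/
# --files actually shipped to the executors. The partial binding and partition
# count are assumptions for illustration; the result is a flat list alternating
# hostname and file listing, one pair per partition.
from functools import partial

def list_worker_files(sc, broadcast_vars=None, num_partitions=4):
    return (sc.parallelize(range(num_partitions), num_partitions)
              .mapPartitions(partial(__ls, broadcast_vars))
              .collect())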
from pyspark import SparkContext
from pyspark.sql import SparkSession  # needed for the session builder below
import sys

sc = SparkContext.getOrCreate()
spark = SparkSession.builder \
    .master("<spark_master_node_IP>") \
    .appName("Image_IP_Protect") \
    .getOrCreate()

# Custom Modules
sc.addFile("import_img.py")
sc.addFile("hashing.py")

from pyspark import SparkConf, SparkContext, SparkFiles
sys.path.insert(0, SparkFiles.getRootDirectory())
import import_img as impg
import hashing as hs

# Pyspark Packages
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.ml.linalg import SparseVector, DenseVector, Vectors
from pyspark.sql.functions import col
from pyspark.ml.feature import MinHashLSH, BucketedRandomProjectionLSH
from pyspark.ml.clustering import KMeans, BisectingKMeans

# Python Packages
import numpy as np
from PIL import Image
from pyspark import SparkConf, SparkContext, SparkFiles
from pyspark.sql import SparkSession  # needed for the session builder below
import sys

# Initiate SparkSession
spark = SparkSession \
    .builder \
    .appName("PythonPi") \
    .getOrCreate()

# Make the Sparkling Water distribution importable on the driver and executors.
spark.sparkContext.addPyFile("/mnt/mesos/sandbox/sparkling-water-2.2.16.zip")
sys.path.insert(
    0,
    '/mnt/mesos/sandbox/sparkling-water-2.2.16/py/build/dist/h2o_pysparkling_2.2-2.2.16.zip'
)
print(SparkFiles.getRootDirectory())
print(help('modules'))
print('\n'.join(sys.path))

import h2o
from pysparkling import *

# Initiate H2OContext
hc = H2OContext.getOrCreate(spark)

pFile = spark.read.orc("s3a://dfs-lab13-ace/testLarge3/orctestdata/")
h2oFrame = hc.asH2OFrame(pFile)

# Stop H2O and Spark services
h2o.cluster().shutdown()
spark.stop()