def train_partition(idx, iterator):
    """Launch a Barista training server for this partition and stream back one
    response per gradient-update request."""
    port = 50000 + idx % 256

    # Files shipped to the executors with SparkContext.addFile().
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset-%02d.hdf5" % idx)

    # Clear any stale readiness flag left over from a previous run.
    flag_file = "flags/__BARISTA_READY__.%d" % port
    if os.path.isfile(flag_file):
        os.remove(flag_file)

    # out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver,
                      "--dset-size", "30000",
                      "--initial-replay", "20000",
                      "--debug",
                      "--overwrite",
                      "--port", str(port)])

    # Busy-wait until the server creates its readiness flag.
    while not os.path.isfile(flag_file):
        pass

    for step in iterator:
        dc = DummyClient("127.0.0.1", port)
        dc.send(barista.GRAD_UPDATE)
        response = dc.recv()
        yield response
def spawn_barista(partition):
    """Spawn a single Barista training server and block until it reports ready."""
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset.hdf5")

    # Clear any stale readiness flag left over from a previous run.
    flag_file = "flags/__BARISTA_READY__"
    if os.path.isfile(flag_file):
        os.remove(flag_file)

    out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver],
                     stdout=out,
                     stderr=subprocess.STDOUT)

    # Busy-wait until the server creates its readiness flag.
    while not os.path.isfile(flag_file):
        pass
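# --- Usage sketch (not from the original source) ----------------------------------
# A minimal illustration of how train_partition() above could be wired into a job:
# the driver ships the model/solver files with addFile() so SparkFiles.get() can
# resolve them on the executors, then runs one Barista server per partition via
# mapPartitionsWithIndex(). The step count and partition count are assumptions.
if __name__ == "__main__":
    from pyspark import SparkContext

    sc = SparkContext(appName="barista-train")
    for name in ["main.py", "train_val.prototxt",
                 "deepq16.caffemodel", "solver.prototxt"]:
        sc.addFile(name)  # makes SparkFiles.get(name) work on every executor

    steps = sc.parallelize(range(1000), 4)   # one partition per Barista server
    responses = steps.mapPartitionsWithIndex(train_partition).collect()
    print("%d gradient updates applied" % len(responses))
    sc.stop()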
from pyspark import SparkContext, SparkConf, SparkFiles
from graph import Graph

NUM_PARTITION = 4
# NUM_CORE = 4

graph = Graph()

if __name__ == '__main__':
    sc = SparkContext(master="local[*]", appName="A*-Spark")
    sc.addFile("node.py")
    sc.addFile("nodeitem.py")
    sc.addFile("graph.py")
    SparkFiles.getRootDirectory()

    # read edges from the edge list file (raw string so the backslash is not escaped)
    transition_table_rdd = sc.textFile(r"C:\edge20.edgelist").map(
        lambda line: graph.createEdgeProperty(line),
        preservesPartitioning=True).partitionBy(NUM_PARTITION).cache()

    # create the state list RDD
    state_list_rdd = transition_table_rdd.groupByKey(
        numPartitions=NUM_PARTITION).mapValues(
        graph.getSucessorProperty).sortByKey().map(
        lambda val: graph.lineToNode(val)).cache()

    # share the boundary node IDs
    boundary_nodes = graph.getBoundaryNodesId(state_list_rdd, NUM_PARTITION)
    shared_nodes = sc.broadcast(boundary_nodes)

    # broadcast the target node (the last node after sorting by key); collect once
    target_node = state_list_rdd.collect()[-1]
    share_target_node = sc.broadcast(target_node)
def spark_env(app_name='my_spark_app', master='local[*]', jar_packages=[],
              files=[], spark_config={}):
    """Start Spark session, get Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster. Note that only the app_name argument will
    apply when this is called from a script sent to spark-submit. All other
    arguments exist solely for testing the script from within an interactive
    Python console.

    This function also looks for a file ending in 'config.json' that can be
    sent with the Spark job. If it is found, it is opened, the contents parsed
    (assuming it contains valid JSON for the ETL job configuration) into a
    dict of ETL job configuration parameters, which are returned as the last
    element in the tuple returned by this function. If the file cannot be
    found then the return tuple only contains the Spark session and Spark
    logger objects and None for config.

    The function checks the enclosing environment to see if it is being run
    from inside an interactive console session or from an environment which
    has a `DEBUG` environment variable set (e.g. setting `DEBUG=1` as an
    environment variable as part of a debug configuration within an IDE such
    as Visual Studio Code or PyCharm). In this scenario, the function uses all
    available function arguments to start a PySpark driver from the local
    PySpark package as opposed to using the spark-submit and Spark cluster
    defaults. This will also use local module imports, as opposed to those in
    the zip archive sent to spark via the --py-files flag in spark-submit.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # detect execution environment
    flag_repl = not (hasattr(__main__, '__file__'))
    flag_debug = 'DEBUG' in environ.keys()

    if not (flag_repl or flag_debug):
        # get Spark session factory
        spark_builder = (SparkSession.builder.appName(app_name))
        spark_jars_packages = ','.join(list(jar_packages))
        spark_builder.config('spark.jars.packages', spark_jars_packages)
    else:
        # get Spark session factory
        spark_builder = (SparkSession.builder.master(master).appName(app_name))

        # create Spark JAR packages string
        spark_jars_packages = ','.join(list(jar_packages))
        spark_builder.config('spark.jars.packages', spark_jars_packages)

        spark_files = ','.join(list(files))
        spark_builder.config('spark.files', spark_files)

        # add other config params
        for key, val in spark_config.items():
            spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('configs.json')
    ]
    print(config_files)

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        spark_logger.warn('no config file found')
        config_dict = None

    return spark_sess, spark_logger, config_dict
def __init__(self):
    """Create a Spark context and session."""
    print('__init__ called')
    config = configparser.ConfigParser()
    print('--------------------Start--------------------')
    sc = SparkContext.getOrCreate()
    print(sc.applicationId)
    print(sc.master)
    print('spark conf starts')
    spark = SparkSession(sc)
    for item in spark.sparkContext.getConf().getAll():
        print(item)
    print('spark conf ends')
    # cwd = os.getcwd()
    # arr = os.listdir(cwd)
    # print('--------------------working directories--------------------')
    # print(cwd)
    # print(arr)
    # print('--------------------cmd starts--------------------')
    ## cmd = 'hdfs dfs -ls /home/hadoop'.split()  # cmd must be an array of arguments
    ## files = subprocess.check_output(cmd).strip().split('\n')
    ## for path in files:
    ##     print(path)
    # print('--------------------applicationId--------------------')
    # appId = sc.applicationId
    # print(appId)
    # ip = 'ip-172-31-18-164.ec2.internal'
    # appPath = 'hdfs://' + ip + ':8020/user/hadoop/.sparkStaging/' + appId + '/config.json'
    # iniPath = 'hdfs://' + ip + ':8020/user/hadoop/.sparkStaging/' + appId + '/config.ini'
    # print(appPath)
    # print(iniPath)
    # tmp1 = 'file:///user/hadoop/.sparkStaging/' + appId + '/config.ini'
    # print(tmp1)
    ## print('--------------------config reader starts--------------------')
    ## strng = open(appPath, 'r').read()
    ## print(strng)
    # print('--------------------open ends--------------------')
    # print(SparkFiles.getRootDirectory())
    # print(os.listdir(SparkFiles.getRootDirectory()))
    ## cmd = 'hdfs dfs -ls ' + SparkFiles.getRootDirectory() + ''.split()
    ## files = subprocess.check_output('hdfs dfs -ls ' + SparkFiles.getRootDirectory()).strip().split('\n')
    ## for path in files:
    ##     print(path)
    ## stg_path = str(fs.defaultFS) + "/user/" + str(os.environ['USER']) + "/.sparkStaging/" + str(sc.applicationId) + "/"
    ## lines = sc.textFile(os.path.join(stg_path, 'readme.txt'))  # kept commented: stg_path above is commented out
    ## print(lines.collect())
    # print('--------------------getRootDirectory--------------------')
    ## configFile = pkg_resources.resource_filename(pkg_resources.Requirement.parse("myapp"), "config.ini")
    ## config = ConfigParser.ConfigParser()
    ## config.read(configFile)
    # print('--------------------INI file--------------------')
    # conString = ''
    # inputFile = 'config.ini'
    # with open(SparkFiles.get(inputFile)) as test_file:
    #     conString = test_file.read()
    # print('--------------------config reader ends--------------------')
    # print(conString)
    # print('print(conString) starts')
    # config.read_string(conString.decode())
    # print('print(conString) ends')
    # print(config)
    # val = config.get('SPARK', 'val')
    ## val = config['SPARK']['val']
    # print(val)
    print('--------------------val--------------------')
    print('--------------------JSON file--------------------')
    inputFile = 'config.json'
    # print(os.environ)
    # print(os.environ['SPARK_YARN_STAGING_DIR'])
    # print('os.environ completed')
    print(SparkFiles.getRootDirectory())
    ipath = os.path.join(SparkFiles.getRootDirectory() + '/' + inputFile)
    print(ipath)
    conString = ''
    print('--------------------printing a--------------------')
    a = sc.textFile("file:///" + SparkFiles.get(inputFile)).collect()
    print(a)
    print('--------------------printing b--------------------')
    b = sc.textFile("file:///" + ipath).collect()
    print(b)
    # with open(SparkFiles.get(inputFile)) as test_file:
    #     conString = test_file.read()
    print('--------------------config reader ends--------------------')
    print(conString)
    print('--------------------val--------------------')
    # hdfs://ip-172-31-19-25.ec2.internal:8020
    # print('--------------------Hadoop Files--------------------')
    ## print(SparkFiles.getRootDirectory())
    # print('--------------------Context--------------------')
    ## textFile = sc.textFile(appPath)
    # conf = spark.read.option("multiline", True).json(appPath)
    # print('--------------------Spark Read--------------------')
    # print(conf.printSchema())
    # print(conf.select('SPARK1').first()[0])
    # print(sc.sparkUser())
    # print('--------------------completed--------------------')
    # data = sc.parallelize(list(conf['SPARK']))
    data = sc.parallelize(list('HelloWorld12345'))
    data.map(lambda x: (x, 1)).reduceByKey(add).sortBy(
        lambda x: x[1], ascending=False).coalesce(1).saveAsTextFile(
        'tmp/result/' + str(ts))  # s3://nithin-emr/' + str(ts) + '/result'
    sc.stop()
def start_spark(app_name='my_spark_etl', master='local[*]', jar_packages=[],
                files=[], spark_config={}):
    """Start Spark session, get Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster. Note that only the app_name argument will
    apply when this is called from a script sent to spark-submit. All other
    arguments exist solely for testing the script from within an interactive
    Python console.

    This function also looks for a file ending in 'config.json' that can be
    sent with the Spark job. If it is found, it is opened, the contents
    parsed (assuming it contains valid JSON for the ETL job configuration),
    into a dict of ETL job configuration parameters, which are returned as
    the last element in the tuple returned by this function. If the file
    cannot be found then the return tuple only contains the Spark session
    and Spark logger objects and None for config.

    The function checks the enclosing environment to see if it is being run
    from inside an interactive console session or from an environment which
    has a `DEBUG` environment variable set (e.g. setting `DEBUG=1` as an
    environment variable as part of a debug configuration within an IDE such
    as Visual Studio Code or PyCharm). In this scenario, the function uses
    all available function arguments to start a PySpark driver from the local
    PySpark package as opposed to using the spark-submit and Spark cluster
    defaults. This will also use local module imports, as opposed to those in
    the zip archive sent to spark via the --py-files flag in spark-submit.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # detect execution environment
    flag_repl = not hasattr(__main__, '__file__')
    flag_debug = 'DEBUG' in environ.keys()

    warehouse_location = abspath('spark-warehouse')

    if not (flag_repl or flag_debug):
        # if False:
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .appName(app_name)
            .config('spark.jars', 'configs/spark-avro_2.11-2.4.4.jar')
        )
        # Flag required for suppressing Hive table creation
        # (create_database_table()) and Impala REFRESH
        environment = 'local'
    else:
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .master(master)
            .appName(app_name)
            .config("spark.sql.warehouse.dir", warehouse_location)
            .config('spark.jars', 'configs/spark-avro_2.11-2.4.4.jar')
            .enableHiveSupport()
        )
        # assumed label for this branch; leaving `environment` unbound here would
        # break the warn/return at the end of the function
        environment = 'debug'

        # if inputfile.startswith('gs://') or outputfile.startswith('gs://'):
        spark_builder.config('fs.gs.impl',
                             'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
        spark_builder.config('fs.AbstractFileSystem.gs.impl',
                             'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS')
        # spark_builder.config('spark.jars', '/tool/spark-2.4.5-bin-hadoop2.7/jars/avro-1.8.2.jar')
        spark_builder.config('spark.jars', '../configs/spark-avro_2.11-2.4.4.jar')

        # create Spark JAR packages string
        spark_jars_packages = ','.join(list(jar_packages))
        spark_builder.config('spark.jars.packages', spark_jars_packages)

        spark_files = ','.join(list(files))
        spark_builder.config('spark.files', spark_files)

        # add other config params
        for key, val in spark_config.items():
            spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]

    if len(config_files) != 0:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_json = config_file.read().replace('\n', '')
        config_dict = loads(config_json)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        try:
            with open('configs/etl_config.json', 'r') as config_file:
                config_json = config_file.read().replace('\n', '')
            config_dict = loads(config_json)
        except FileNotFoundError:
            config_dict = None

    spark_logger.warn('environment: ' + environment)

    return spark_sess, spark_logger, config_dict, environment
def start_spark(app_name='flatten_json_spark', master='local[*]',
                jar_packages=None, files=None, spark_config=None):
    """Start Spark session, get the Spark logger and load config files.

    :param app_name: Name of the Spark app.
    :param master: Cluster connection details.
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    if spark_config is None:
        spark_config = {}
    if jar_packages is None:
        jar_packages = []
    if files is None:
        files = []

    if __name__ == '__main__':
        # get Spark session factory
        spark_builder = (SparkSession.builder.appName(app_name))
    else:
        # get Spark session factory
        spark_builder = (SparkSession.builder.appName(app_name).master(master))

    # create Spark JAR packages string
    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    # create file list to send to master and workers
    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add config parameters, if any
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark = spark_builder.getOrCreate()
    logger = Log4j(spark)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('config.json')
    ]

    if len(config_files) != 0:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_json = config_file.read().replace('\n', '')
        config_dict = loads(config_json)
        logger.warn('loaded config from ' + config_files[0])
    else:
        config_dict = None

    # build return tuple conditional on presence of config
    if config_dict is not None:
        return_tup = spark, logger, config_dict
    else:
        return_tup = spark, logger
    return return_tup
def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[],
                files=[], spark_config={}):
    """Start Spark session, get the Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster. NOTE - only the app_name argument will
    apply when this is called from a script sent to spark-submit (i.e. when
    __name__ = '__main__'). All other arguments exist solely for testing the
    script from within an interactive Python console.

    This function also looks for a file ending in 'config.json' that can be
    sent with the Spark job. If it is found, it is opened, the contents
    parsed (assuming it contains valid JSON for the ETL job configuration),
    into a dict of ETL job configuration parameters, which are returned as
    the last element in the tuple returned by this function. If the file
    cannot be found then the return tuple only contains the Spark session
    and Spark logger objects.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    if __name__ == '__main__':
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .appName(app_name))
    else:
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .master(master)
            .appName(app_name))

        # create Spark JAR packages string
        spark_jars_packages = ','.join(list(jar_packages))
        spark_builder.config('spark.jars.packages', spark_jars_packages)

        spark_files = ','.join(list(files))
        spark_builder.config('spark.files', spark_files)

        # add other config params
        for key, val in spark_config.items():
            spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]

    if len(config_files) != 0:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_json = config_file.read().replace('\n', '')
        config_dict = loads(config_json)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        config_dict = None

    # build return tuple conditional on presence of config
    if config_dict is not None:
        return_tup = spark_sess, spark_logger, config_dict
    else:
        return_tup = spark_sess, spark_logger
    return return_tup
def main(): # start connection # configure spark instance to default global s_context global Logger global mylogger global s_context config = SparkConf() config.setAppName("Gait-Realtime-Analysis") s_context = SparkContext(conf=config) s_context.setLogLevel("ERROR") sys.path.insert(0, SparkFiles.getRootDirectory()) s_context.addFile('./model/cnn_modell.h5') s_context.addFile("./data_transformation.py") # TODO: add logger to spark # use spark context to create the stream context # 5 seconds ensure that we get two overlapping samples of 4 seconds interval_seconds = 10 s_stream_context = pss.StreamingContext(s_context, interval_seconds) s_stream_context.checkpoint("checkpoint_TSA") # with tf.gfile.GFile('./frozenInferenceGraphIdentification.pb', "rb") as f: # model_data = f.read() # model_data_bc = s_context.broadcast(model_data) # model_data_bc = s_context.broadcast(loaded_model) # connect to port 9009 i.e. twitter-client print(API_SERVICE_URL + ' ' + SPARK_SOCKET_PORT) socket_ts = s_stream_context.socketTextStream(API_SERVICE_URL, int(SPARK_SOCKET_PORT)) print("\n################################\n") line = socket_ts.flatMap(lambda line: line.split("\n")) gait = line.map(lambda g: (getUserId(g).strip(), g.strip())) gaitByUserId = gait.groupByKey() sortedGaitByUserId = gaitByUserId.transform( lambda foo: foo.sortBy(lambda x: (x[0]))) # sortedGaitByUserId = gaitByUserId.sortByKey() # author_counts_sorted_dstream = author_counts.transform(\ # (lambda foo:foo\ # .sortBy(lambda x:( -x[1])) ) # ) # author_counts_sorted_dstream.pprint() # sortedGaitByUserId.foreachRDD(another) segmentedData = sortedGaitByUserId.mapPartitions(partition_mapper_func) # x = cogrouped.mapValues(iterate) # for e in x.collect(): # print (e) # segmentedData.pprint() # DO NOT CHANGE THE LOCATION OF THIS FUNCTION def infer(data_rdd): # print("ATTEMPTING DEEP LEARNING") try: datas = data_rdd.collect() if len(datas) > 0: # print("INSIDE TRY BEFORE WITH") # with tf.Graph().as_default() as graph: # graph_def = tf.GraphDef() # graph_def.ParseFromString(model_data_bc.value) # tf.import_graph_def(graph_def, name="prefix") # print("INSIDE TRY AFTER WITH") # x = graph.get_tensor_by_name('prefix/Placeholder:0') # y = graph.get_tensor_by_name('prefix/Softmax:0') for data in datas: for id_xyz in data: if id_xyz: id = id_xyz[0] dummy_axis = "0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 
0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00" input_signals = [] input_signals.extend(id_xyz[1:]) for i in range(3): input_signals.append(dummy_axis) X_signals = [] for each in input_signals: X_signals.append([ np.array(cell, dtype=np.float32) for cell in [each.strip().split(' ')] ]) X_test = np.transpose(np.array(X_signals), (1, 2, 0)) from pyspark import SparkFiles from tensorflow.keras.models import load_model path = SparkFiles.get('cnn_modell.h5') model = load_model(path) print("Loaded model from disk") preds = model.predict(X_test) for p in preds: inferred_user_id = str(np.argmax(p) + 1) results = { 'confidency': str(np.amax(p)), 'inferred_user_id': inferred_user_id, 'actual_user_id': str(id) } print(results) requests.post(back_end_url, json=results) # with tf.Session(graph=graph) as sess: # y_out = sess.run(y, feed_dict={ # x: X_test # }) # for each in y_out: # inferred_user_id = str(np.argmax(each) + 1) # confidency = str(np.amax(each)) # actual_user_id = str(id) # results = {'confidency': confidency, 'inferred_user_id': inferred_user_id, # 'actual_user_id': actual_user_id} # print(results) # requests.post(back_end_url, json=results) except: e = sys.exc_info() print("Error: %s" % e) print('infer:', 'running inference on segmented data') segmentedData.foreachRDD(infer) # start the streaming computation s_stream_context.start() try: # wait for the streaming to finish s_stream_context.awaitTermination() except KeyboardInterrupt: print("\nSpark shutting down\n")
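# --- Usage note (not from the original source) ------------------------------------
# The streaming job above relies on s_context.addFile('./model/cnn_modell.h5') on
# the driver: every executor can then resolve the same file locally when it runs
# the inference step, e.g.
#
#   from pyspark import SparkFiles
#   from tensorflow.keras.models import load_model
#   model = load_model(SparkFiles.get('cnn_modell.h5'))
#
# so the Keras model does not have to be baked into worker images or broadcast
# manually alongside the data.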
from pyspark import SparkConf, SparkFiles
from pyspark.sql import SparkSession
import time
import os
import sys

conf = SparkConf().setAppName("test")
spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()

# Walk the work dir and the current dir, printing (path, subdirs, files) per entry.
print("============")
for p, s, d in os.walk("/opt/spark/work-dir/"):
    print(f"{p}, {s}, {d}")
print("============")

currentDirectory = os.getcwd()
print(currentDirectory)
for p, s, d in os.walk(currentDirectory):
    print(f"{p}, {s}, {d}")

# Move into the SparkFiles root so files shipped with the job can be read by
# relative name, and make lib.zip importable.
os.chdir(SparkFiles.getRootDirectory())
sys.path.append("lib.zip")

import test_spark.unit_a as unit_a

unit_a.hello()

with open("data.txt", 'r') as fh:
    print(fh.read())

# time.sleep(300)
spark.stop()
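# --- Usage note (not from the original source) ------------------------------------
# For the script above to find lib.zip and data.txt under SparkFiles.getRootDirectory(),
# they have to be shipped with the job; the submit command below is illustrative:
#
#   spark-submit --files data.txt,lib.zip test_job.py
#
# or, equivalently, set them on the SparkConf before creating the session:
#
#   conf = SparkConf().setAppName("test").set("spark.files", "data.txt,lib.zip")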
def getSparkSession(self, files=[], jars=[], spark_config={}):
    """
    Setting Spark config in code takes precedence over setting it in spark-submit:
    https://spark.apache.org/docs/latest/submitting-applications.html#loading-configuration-from-a-file

    :param app_name:
    :param master:
    :param jars:
    :param files:
    :param spark_config:
    :return:
    """
    # collect jars bundled with the project plus any passed in by the caller
    src_jars_path = os.path.join(self.root_dir, 'jars')
    src_jars = [
        os.path.join(src_jars_path, f) for f in os.listdir(src_jars_path)
    ]
    jars = ','.join(src_jars + jars)

    # collect files bundled with the project
    src_files_path = os.path.join(self.root_dir, 'files')
    src_files = [
        os.path.join(src_files_path, f) for f in os.listdir(src_files_path)
    ]
    files = ','.join(src_files)

    spark_builder = (
        SparkSession.builder
        # .master(master)
        # .appName(app_name)
        # .config('spark.executor.memory', '1g')
        # .config("spark.driver.memory", "1g")
        # .config("spark.memory.offHeap.size", "1g")
        .config("spark.jars", jars)
        .config('spark.files', files)
        # .config("spark.hadoop.fs.s3a.endpoint", 'http://xxxx')
        # .config("spark.hadoop.fs.s3a.access.key", os.environ.get('MINIO_ACCESS_KEY_ID'))
        # .config("spark.hadoop.fs.s3a.secret.key", os.environ.get('MINIO_SECRET_ACCESS_KEY'))
        .config("spark.local.dir", self.temp_dir))

    for key, val in spark_config.items():
        spark_builder.config(key, val)

    spark = spark_builder.getOrCreate()

    # read the first *config.json shipped via spark.files and copy its entries
    # into the runtime Spark conf
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        os.path.join(spark_files_dir, filename)
        for filename in os.listdir(spark_files_dir)
        if filename.endswith('config.json')
    ][0]
    with open(config_files, 'r') as f:
        config_map = json.load(f)
        for key, val in config_map.items():
            spark.conf.set(key, val)

    print(spark.sparkContext.getConf().getAll())
    print(spark.conf.get("spark.cassandra.connection.host"))
    print(spark.conf.get("spark.hadoop.fs.s3a.access.key"))
    print(spark.conf)
    # spark._jsc.had
    return spark
def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[],
                files=[], spark_config={}):
    """Start Spark session, get Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster. This function also looks for a file ending
    in 'config.json' that can be sent with the Spark job.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # detect execution environment
    flag_repl = not(hasattr(__main__, '__file__'))
    flag_debug = 'DEBUG' in environ.keys()

    if not (flag_repl or flag_debug):
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .appName(app_name))
    else:
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .master(master)
            .appName(app_name))

        # create Spark JAR packages string
        spark_jars_packages = ','.join(list(jar_packages))
        spark_builder.config('spark.jars.packages', spark_jars_packages)

        spark_files = ','.join(list(files))
        spark_builder.config('spark.files', spark_files)

        # add other config params
        for key, val in spark_config.items():
            spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging_utils.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        spark_logger.warn('no config file found')
        config_dict = None

    return spark_sess, spark_logger, config_dict
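# --- Usage sketch (not from the original source) ----------------------------------
# How the start_spark() variants above are typically invoked. The file names and the
# job body below are assumptions for illustration; the key point is that anything
# passed via --files (or the `files` argument in debug/REPL mode) lands in
# SparkFiles.getRootDirectory(), which is exactly where the config lookup searches.
#
#   spark-submit --master yarn \
#       --py-files packages.zip \
#       --files configs/etl_config.json \
#       jobs/etl_job.py
if __name__ == '__main__':
    spark, log, config = start_spark(
        app_name='my_etl_job',
        files=['configs/etl_config.json'])
    log.warn('config loaded: ' + str(config is not None))
    spark.stop()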
def _get_or_create_tmp_dir():
    root_dir = SparkFiles.getRootDirectory()
    xgb_tmp_dir = os.path.join(root_dir, "xgboost-tmp")
    if not os.path.exists(xgb_tmp_dir):
        os.makedirs(xgb_tmp_dir)
    return xgb_tmp_dir
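# --- Usage sketch (not from the original source) ----------------------------------
# _get_or_create_tmp_dir() is intended to run on executors, where
# SparkFiles.getRootDirectory() points at the per-application scratch directory,
# so anything written there is cleaned up when the application ends. The helper
# and file-name scheme below are assumptions for illustration.
def _write_partition_scratch(idx, iterator):
    """Spill one partition to the executor-local xgboost-tmp directory."""
    scratch = os.path.join(_get_or_create_tmp_dir(), "part-%05d.txt" % idx)
    with open(scratch, "w") as fh:
        for row in iterator:
            fh.write(str(row) + "\n")
    yield scratch

# e.g. sc.parallelize(range(100), 4).mapPartitionsWithIndex(_write_partition_scratch).collect()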
def start_spark(app_name='hello_fresh_etl_job', master='local[*]',
                jar_packages=[], files=[], spark_config={}):
    """Start Spark session, get Spark logger and load config files.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # detect execution environment
    flag_repl = not hasattr(__main__, '__file__')
    flag_debug = 'DEBUG' in environ.keys()

    if not (flag_repl or flag_debug):
        # get Spark session factory
        spark_builder = (SparkSession.builder.appName(app_name))
        environment = 'local'
    else:
        # get Spark session factory
        spark_builder = (SparkSession.builder.master(master).appName(app_name))
        # assumed label for this branch; leaving `environment` unbound here would
        # break the return at the end of the function
        environment = 'debug'

        # create Spark JAR packages string
        spark_jars_packages = ','.join(list(jar_packages))
        spark_builder.config('spark.jars.packages', spark_jars_packages)

        spark_files = ','.join(list(files))
        spark_builder.config('spark.files', spark_files)

        # add other config params
        for key, val in spark_config.items():
            spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('config.json')
    ]

    if len(config_files) != 0:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_json = config_file.read().replace('\n', '')
        config_dict = loads(config_json)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        try:
            with open('configs/etl_config.json', 'r') as config_file:
                config_json = config_file.read().replace('\n', '')
            config_dict = loads(config_json)
        except FileNotFoundError:
            config_dict = None

    return spark_sess, spark_logger, config_dict, environment
def __ls(broadcast_vars, iterator):
    """Get the list of files in the worker-local directory."""
    return [__get_hostname(), os.listdir(SparkFiles.getRootDirectory())]
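# --- Usage sketch (not from the original source) ----------------------------------
# One way to run __ls() above on every partition and see which files addFile()/
# --files actually shipped to the executors. The partial binding and partition
# count are assumptions for illustration; the result is a flat list alternating
# hostname and file listing, one pair per partition.
from functools import partial

def list_worker_files(sc, broadcast_vars=None, num_partitions=4):
    return (sc.parallelize(range(num_partitions), num_partitions)
              .mapPartitions(partial(__ls, broadcast_vars))
              .collect())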
from pyspark import SparkContext
from pyspark.sql import SparkSession  # needed for the session builder below
import sys

sc = SparkContext.getOrCreate()
spark = SparkSession.builder \
    .master("<spark_master_node_IP>") \
    .appName("Image_IP_Protect") \
    .getOrCreate()

# Custom Modules
sc.addFile("import_img.py")
sc.addFile("hashing.py")

from pyspark import SparkConf, SparkContext, SparkFiles
sys.path.insert(0, SparkFiles.getRootDirectory())
import import_img as impg
import hashing as hs

# Pyspark Packages
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.ml.linalg import SparseVector, DenseVector, Vectors
from pyspark.sql.functions import col
from pyspark.ml.feature import MinHashLSH, BucketedRandomProjectionLSH
from pyspark.ml.clustering import KMeans, BisectingKMeans

# Python Packages
import numpy as np
from PIL import Image
from pyspark import SparkConf, SparkContext, SparkFiles
from pyspark.sql import SparkSession  # needed for the session builder below
import sys

# Initiate SparkSession
spark = SparkSession \
    .builder \
    .appName("PythonPi") \
    .getOrCreate()

# Make the Sparkling Water distribution importable on the driver and executors.
spark.sparkContext.addPyFile("/mnt/mesos/sandbox/sparkling-water-2.2.16.zip")
sys.path.insert(
    0,
    '/mnt/mesos/sandbox/sparkling-water-2.2.16/py/build/dist/h2o_pysparkling_2.2-2.2.16.zip'
)
print(SparkFiles.getRootDirectory())
print(help('modules'))
print('\n'.join(sys.path))

import h2o
from pysparkling import *

# Initiate H2OContext
hc = H2OContext.getOrCreate(spark)

pFile = spark.read.orc("s3a://dfs-lab13-ace/testLarge3/orctestdata/")
h2oFrame = hc.asH2OFrame(pFile)

# Stop H2O and Spark services
h2o.cluster().shutdown()
spark.stop()