def word_count_repartition_n(internal_param, data_file):
    try:
        conf = SparkConf().setMaster("spark://dana:7077").setAppName(
            internal_param[1]).setAll([
                ('spark.driver.cores', internal_param[2]),
                ('spark.driver.memory', internal_param[3]),
                ('spark.executor.instances', internal_param[4]),
                ('spark.executor.memory', internal_param[5]),
                ('spark.executor.cores', internal_param[6])
            ])
        sc = SparkContext(conf=conf, pyFiles=[
            'run_app.py', 'repartition_scripts.py', 'wordCountConfig.py'
        ])
        data = sc.textFile(data_file)
        # Target partitions = executor cores * executor instances * scaling factor.
        num_partitions = int(
            int(sc.getConf().get("spark.executor.cores")) *
            int(sc.getConf().get("spark.executor.instances")) *
            float(internal_param[7]))
        repartitionData = data.repartition(num_partitions)
        words = repartitionData.flatMap(mymapeo)
        print(data.getNumPartitions())
        print(words.getNumPartitions())
        print('NUM WORDS PARTITIONS ' + str(num_partitions))
        frequencies = words.filter(lambda x: x != '').map(
            lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
        print('Words frequencies:', frequencies.take(5))
        print('NUM FREQUENCIES PARTITIONS ' + str(frequencies.getNumPartitions()))
        app_id = sc.applicationId
        sc.stop()
        return app_id
    except Exception:
        print("Configuration error: " + str(internal_param))
        sc.stop()
def spark_session(request):
    conf = SparkConf()
    conf.set("spark.jars",
             "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector"
             "-hadoop2-2.0.1.jar")
    conf.set("spark.jars.packages",
             "com.google.cloud.spark:spark-bigquery-with-dependencies_2.11:0.13.1-beta")
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set(
        "fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    sc._jsc.hadoopConfiguration().set(
        "fs.AbstractFileSystem.gs.impl",
        "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    sc._jsc.hadoopConfiguration().set(
        "google.cloud.auth.service.account.enable", "true")
    sa_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    if sa_path is not None:
        sc._jsc.hadoopConfiguration().set(
            "google.cloud.auth.service.account.json.keyfile", sa_path)
    spark = SparkSession.builder \
        .config(conf=sc.getConf()) \
        .getOrCreate()
    request.addfinalizer(lambda: spark.stop())
    return spark
def main_stats():
    spark_context = SparkContext(CLUSTER, APP_NAME, pyFiles=[__file__])
    num_executor = int(spark_context.getConf().get("spark.executor.instances"))
    world_cities_file = spark_context.textFile("hdfs://" + sys.argv[1],
                                               minPartitions=num_executor)
    tuplified_cities = tuplify_city(world_cities_file)
    print(cities_stats(tuplified_cities))
def spark(request):
    conf = SparkConf()
    conf.set(
        'spark.jars',
        'https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector'
        '-hadoop2-2.0.1.jar')
    conf.set(
        'spark.jars.packages',
        'com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.19.1')
    conf.set('spark.driver.host', '127.0.0.1')
    sc = SparkContext(master='local', conf=conf)
    sc._jsc.hadoopConfiguration().set(
        'fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
    sc._jsc.hadoopConfiguration().set(
        'fs.AbstractFileSystem.gs.impl',
        'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS')
    sc._jsc.hadoopConfiguration().set(
        'google.cloud.auth.service.account.enable', 'true')
    sa_path = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
    if sa_path is not None:
        sc._jsc.hadoopConfiguration().set(
            'google.cloud.auth.service.account.json.keyfile', sa_path)
    spark = SparkSession.builder \
        .config(conf=sc.getConf()) \
        .getOrCreate()
    request.addfinalizer(lambda: spark.stop())
    return spark
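The two fixtures above are pytest-style; a minimal sketch of a test that consumes the `spark` fixture (assuming it is registered with `@pytest.fixture`, e.g. in a conftest.py not shown here) could look like this:

# Hypothetical test using the `spark` fixture above; the DataFrame contents
# are placeholders, only the fixture wiring comes from the snippet.
from pyspark.sql import SparkSession


def test_local_dataframe(spark: SparkSession):
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
    assert df.count() == 2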
def main_hist():
    spark_context = SparkContext(CLUSTER, APP_NAME, pyFiles=[__file__])
    num_executor = int(spark_context.getConf().get("spark.executor.instances"))
    world_cities_file = spark_context.textFile("hdfs://" + sys.argv[1],
                                               minPartitions=num_executor)
    histogram_rdd = histogram(world_cities_file)
    for line in histogram_rdd.take(10):
        print(line)
def main_tuple_2():
    spark_context = SparkContext(CLUSTER, APP_NAME, pyFiles=[__file__])
    num_executor = int(spark_context.getConf().get("spark.executor.instances"))
    world_cities_file = spark_context.textFile("hdfs://" + sys.argv[1],
                                               minPartitions=num_executor)
    tuple_2 = tuplify_city(world_cities_file)
    for line in tuple_2.take(10):
        print(line)
def test_create_spark_context_first_then_spark_session(self):
    sc = None
    session = None
    try:
        conf = SparkConf().set("key1", "value1")
        sc = SparkContext('local[4]', "SessionBuilderTests", conf=conf)
        session = SparkSession.builder.config("key2", "value2").getOrCreate()

        self.assertEqual(session.conf.get("key1"), "value1")
        self.assertEqual(session.conf.get("key2"), "value2")
        self.assertEqual(session.sparkContext, sc)

        self.assertFalse(sc.getConf().contains("key2"))
        self.assertEqual(sc.getConf().get("key1"), "value1")
    finally:
        if session is not None:
            session.stop()
        if sc is not None:
            sc.stop()
def main_join():
    sc = SparkContext(CLUSTER, APP_NAME, pyFiles=[__file__])
    num_executor = int(sc.getConf().get("spark.executor.instances"))
    world_cities_file = sc.textFile("hdfs://" + sys.argv[1],
                                    minPartitions=num_executor)
    region_codes_file = sc.textFile("hdfs://" + sys.argv[2],
                                    minPartitions=num_executor)
    joined_rdd = join(world_cities_file, region_codes_file)
    for line in joined_rdd.take(10):
        print(line)
def spark_conf():
    """
    Initialize the Spark objects.
    :return: tuple of (SparkSession, SparkContext, SQLContext)
    """
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    sqlc = SQLContext(sparkContext=sc)
    spark = SparkSession.builder.config(conf=sc.getConf()) \
        .enableHiveSupport().getOrCreate()
    return spark, sc, sqlc
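A short, illustrative usage sketch for the `spark_conf()` helper above (the `range`/`show` calls are placeholders, not from the original project):

# Illustrative call of spark_conf(); the work done with the session is a placeholder.
spark, sc, sqlc = spark_conf()
spark.range(5).show()
sc.stop()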
def word_count_sort_repartition_n(internal_param, data_file):
    try:
        conf = SparkConf().setMaster("spark://dana:7077").setAppName(
            internal_param[1]).setAll([
                ('spark.driver.cores', internal_param[2]),
                ('spark.driver.memory', internal_param[3]),
                ('spark.executor.instances', internal_param[4]),
                ('spark.executor.memory', internal_param[5]),
                ('spark.executor.cores', internal_param[6])
            ])
        sc = SparkContext(conf=conf, pyFiles=[
            'run_app_small.py', 'run_app.py', 'sesgo_scripts.py',
            'persist_scripts.py', 'repartition_scripts.py',
            'config_scripts.py', 'wordCountConfig.py'
        ])
        data = sc.textFile(data_file)
        # Target partitions = executor cores * executor instances * scaling factor.
        num_partitions = int(
            int(sc.getConf().get("spark.executor.cores")) *
            int(sc.getConf().get("spark.executor.instances")) *
            float(internal_param[7]))
        repartitionData = data.repartition(num_partitions)
        words = repartitionData.flatMap(mymapeo)
        print(data.getNumPartitions())
        print(words.getNumPartitions())
        print('NUM WORDS PARTITIONS ' + str(num_partitions))
        frequencies = words.filter(lambda x: x != '').map(
            lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
        repartfrequencies = frequencies.repartition(num_partitions)
        numWords = data.count()
        sortFreq = frequencies.sortBy(lambda x: x[1], ascending=False)
        topFreqs = sortFreq.take(5)
        print('Number of words: ', numWords)
        print('Words frequencies:', sortFreq.take(5))
        print('Top 5 frequencies:', topFreqs)
        app_id = sc.applicationId
        sc.stop()
        return app_id
    except Exception:
        print("Configuration error: " + str(internal_param))
        sc.stop()
def word_count_sort_pesist_disk_only(internal_param, data_file):
    try:
        conf = SparkConf().setMaster("spark://dana:7077").setAppName(
            internal_param[1]).setAll([
                ('spark.driver.cores', internal_param[2]),
                ('spark.driver.memory', internal_param[3]),
                ('spark.executor.instances', internal_param[4]),
                ('spark.executor.memory', internal_param[5]),
                ('spark.executor.cores', internal_param[6])
            ])
        sc = SparkContext(conf=conf,
                          pyFiles=['run_app.py', 'config_scriptsDf.py'])
        spark = SparkSession.builder.config(conf=conf).getOrCreate()
        # Collect all words to the driver, then re-parallelize into 100 partitions.
        data = sc.textFile(data_file).flatMap(lambda x: x.split(" ")).collect()
        paralData = sc.parallelize(data, 100)
        print(paralData.getNumPartitions())
        print(sc.getConf().get("spark.executor.instances"))
        df = paralData.map(lambda r: Row(r)).toDF(["word"])
        df.show()
        cleanDf = df.filter(col('word') != '').withColumn(
            'word', regexp_replace(col('word'), r'[^\sa-zA-Z0-9]', ''))
        result = cleanDf.withColumn(
            'count', lit(1)).groupBy('word').sum('count').withColumnRenamed(
                'sum(count)', 'frequencies')
        result.show()
        numWords = len(data)  # data is a plain Python list after collect()
        sortFreq = result.orderBy(col('frequencies').desc())
        topFreqs = sortFreq.take(5)
        print('Number of words: ', numWords)
        print('Words frequencies:', sortFreq.collect())
        print('Top 5 frequencies:', topFreqs)
        app_id = sc.applicationId
        sc.stop()
        return app_id
    except Exception:
        print("Configuration error: " + str(internal_param))
        sc.stop()
def build_spark_session(
        app_name: str,
        spark_config: DefaultDict[str, str] = None,
        hadoop_config: DefaultDict[str, str] = None) -> SparkSession:
    conf = SparkConf()
    if spark_config:
        conf.setAll(spark_config.items())

    sc = SparkContext(conf=conf)
    if hadoop_config:
        for k, v in hadoop_config.items():
            sc._jsc.hadoopConfiguration().set(k, v)

    return SparkSession.builder \
        .appName(app_name) \
        .config(conf=sc.getConf()) \
        .getOrCreate()
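An illustrative call of `build_spark_session` above; the app name and config keys/values here are placeholders, not taken from the original project:

# Hypothetical invocation; keys and values are examples only.
spark = build_spark_session(
    app_name="example-app",
    spark_config={"spark.executor.memory": "2g",
                  "spark.sql.shuffle.partitions": "64"},
    hadoop_config={"fs.gs.impl":
                   "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"},
)
print(spark.sparkContext.getConf().get("spark.executor.memory"))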
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--review-file", help="The input file of reviews")
    parser.add_argument("--movie-file", help="The input file of movies")
    parser.add_argument("--task-2", action="store_true", help="For task 2")
    parser.add_argument("--K", "-k", help="Number of top K words")
    parser.add_argument("--t")
    parser.add_argument("--tuning", action="store_true")
    args = parser.parse_args()

    review_file = args.review_file
    movie_file = args.movie_file
    is_task_2 = args.task_2
    k = int(args.K or 10)
    t = args.t
    tuning = args.tuning

    if tuning:
        threads = [1, 2, 3, 5, 8, 13]
    else:
        threads = [t if t else 1]

    for thread in threads:
        master = 'local[{}]'.format(thread)
        conf = SparkConf().setMaster(master).setAppName("MovieRanking")
        spark_context = SparkContext(conf=conf)
        print('spark conf: {}'.format(str(spark_context.getConf().getAll())))
        time_start = datetime.datetime.now()
        movies_tuple = generate_rdds(movie_file, review_file, spark_context)
        if is_task_2:
            complete_task_2(movies_tuple)
        else:
            complete_task_1(movies_tuple, k)
        time_end = datetime.datetime.now()
        print('threads:{}, duration: {}'.format(thread, time_end - time_start))
        # SparkContext.stop(spark_context)
        spark_context.stop()
def main():
    # parse command line options
    (options, args) = parseOptions()

    if len(args) != 2:
        raise Exception("need an input file and an output path")

    # set number of file partitions/parallelism
    if options.numPartitions is None:
        # Pick the number of partitions based on the default amount of
        # parallelism and the file size. Default parallelism is related to
        # the number of cores on the machine.
        # NOTE: reading sc.defaultParallelism requires an existing
        # SparkContext, which in this snippet is only created below.
        partFactor = 1  # how many times the default parallelism
        numPartitions = sc.defaultParallelism * partFactor
    else:
        numPartitions = options.numPartitions

    conf = SparkConf().setAppName("wordCount").setMaster(
        "local[" + str(numPartitions) + "]")
    sc = SparkContext(conf=conf)
    conf = sc.getConf()
    print("conf=" + str(conf.getAll()))
    print("defaultMinPartitions=" + str(sc.defaultMinPartitions))
    print("defaultParallelism=" + str(sc.defaultParallelism))

    inputFileName = args[0]
    outputFileName = args[1]

    timeStart = time.time()
    file = sc.textFile(inputFileName, minPartitions=numPartitions)
    counts = file.count()
    timeEnd = time.time()
    dtRead = timeEnd - timeStart  # time in seconds

    # write out to a file
    timeStart = time.time()
    file.saveAsTextFile(outputFileName)
    timeEnd = time.time()
    dtWrite = timeEnd - timeStart  # time in seconds

    print("read+count time=" + str(dtRead) + " s")
    print("write time=" + str(dtWrite) + " s")
    print("number of lines=" + str(counts))
    print("num Partitions=" + str(file.getNumPartitions()))
def main():
    database = "/home/manuelr/ths-client/app/app.db"
    conn = create_connection(database)
    sc = SparkContext(appName='Insert Tweets')
    spark = get_spark_session_instance(sc.getConf())
    cur = conn.cursor()
    result = cur.execute(
        ''' SELECT MAX(date_modified), date_created FROM tweets ''').fetchone()
    if result[1] is None:
        max_date = '2018-10-01 00:00:00.000'
    else:
        max_date = result[1]
    spark.sql('use thsfulltext')
    df = spark.sql('select twitter_id, full_text, inserted_tweet from tweet')
    df = df.filter(
        df.inserted_tweet.between(str(max_date), str(
            datetime.today()))).orderBy(df.inserted_tweet.asc())
    tweets = df.collect()
    count = len(tweets)
    sql_select = ''' SELECT tweet_id FROM tweets WHERE tweet_id = ?'''
    limit = 0
    index = 0
    with conn:
        while limit < 5000:
            while index < count and cur.execute(
                    sql_select,
                    [str(tweets[index].twitter_id)]).fetchone() is not None:
                index = index + 1
                print('tweet already inserted')
            if index >= count:
                print('There are no more tweets to insert')
                break
            insert_tweet(conn, tweets[index])
            limit = limit + 1
            index = index + 1
.master("yarn") \ .appName("TimingLimit") \ .getOrCreate() order_info = spark.table('dev.dev_lgb_fullStock_TimingLimit_order_info') waybill_info = spark.table('dev.dev_lgb_fullStock_TimingLimit_waybill_info') # dev_lgb_fullStock_TimingLimit_order_info # dev_lgb_fullStock_TimingLimit_waybill_info # 二、SparkSession 部分 rdd 部分 def rmDirFiles(rm_path): ''' saveAsTextFile 方法不能覆盖文件夹,需要先删除。 ''' rm_cmd = 'hadoop fs -rm -r {0}'.format(rm_path) try: os.system(rm_cmd) except: print '[ {0} ] the path has already been removed !'.format(rm_cmd) save_path = r'hdfs://ns15/user/cmo_ipc/longguangbin/work/pytest' sc = SparkContext(master="yarn", appName="My App") sc_conf = map(lambda x: x[0] + ':' + x[1], sc.getConf().getAll()) rmDirFiles(save_path + os.sep + 'sc_conf') sc.parallelize(sc_conf).repartition(1).saveAsTextFile(save_path + os.sep + 'sc_conf')
import os

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import *

sc = SparkContext()
spark = SparkSession.builder.appName(
    "Python Spark SQL basic example").getOrCreate()

# Define the address of the PMI server and the number of MPI workers
hostname = os.uname()[1]
hydra_proxy_port = os.getenv("HYDRA_PROXY_PORT")
pmi_port = hostname + ":" + hydra_proxy_port
sc.getConf().set('mpi', pmi_port)

targetPartition = 4
sizeX = 100
sizeY = 216
sizeZ = 261
sliceSize = sizeX * sizeY
maxIndex = 100 * 216 * 261


def getPartition(value):
    count = 0
    threshold = 0
    for z in zSizes:
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from utils import *
from pyspark import SparkContext, SparkConf

file_path = "./data.csv"
checkpoint_dir = "./CheckpointDir/"

conf = SparkConf().setAppName("Car Price Prediction").setMaster("local[*]")
sc = SparkContext(conf=conf)
print(sc.getConf().getAll())
sc.setCheckpointDir(checkpoint_dir)
spark = SQLContext(sc)

data = spark.read.csv(path=file_path, header=True, quote='"', sep=",",
                      inferSchema=True)
data_test, data_train = data.randomSplit(weights=[0.3, 0.7], seed=10)
get_indexer_input = get_indexer_input(data)


def model_training(data_train, indexer_input):
    x_cols = list(
        set(data_train.columns) - set(list(indexer_input.keys()) + ["Price"]))
    str_ind_cols = ['indexed_' + column for column in indexer_input.keys()]
    indexers = indexer_input.values()
    pipeline_tr = Pipeline(stages=indexers)
    data_tr = pipeline_tr.fit(data_train).transform(data_train)
def main():
    sc = SparkContext()
    spark = SparkSession.builder.appName("SimpleApp").getOrCreate()
    # Prints configuration at start of run on EMR
    print('*' * 60, '\n', sc.getConf().getAll(), '\n', '*' * 60, '\n')

    strip_chars = ".?,!;:\"/>\\'()#&"
    rgx = sc.broadcast(re.compile('[%s]' % strip_chars))

    def process_str(row):
        """
        Input: text row from dataframe
        Output: list of words with punctuation removed
        Note that this must be declared in main for proper function
        """
        body_list = []
        try:
            for word in row.lower().split():
                word = rgx.value.sub('', word)
                body_list.append(word)
            return body_list
        except Exception as e:
            print(e)
            print(row)
            return ['']

    # Declaration of 'user defined functions' so nodes can use them
    process = udf(process_str, ArrayType(StringType()))
    good_bad = udf(good_bad_filter, IntegerType())

    # Directory of reviews: s3://amazon-reviews-pds/tsv/
    # The use of wildcards (*_us_*.gz) allows spark to load all but the non-english reviews
    full_df = spark.read.csv('s3://amazon-reviews-pds/tsv/*_us_*.gz',
                             sep="\t", header=True, inferSchema=True)
    # full_df_in = spark.read.csv('s3://amazon-reviews-pds/tsv/amazon_reviews_us_Video_v1_00.tsv.gz', sep="\t", header=True, inferSchema=True)

    # Repartitioning the Dataframe allows each task to be split to the workers
    repartition_num = 1000
    full_df = full_df.repartition(repartition_num)

    # Filters out 3 star ratings, and only keeps the review_headline, review_body, and star_rating columns
    # The good_bad function makes 4 and above become 1 (positive review), and 2 and below become 0 (negative review)
    filtered_df = full_df.select('review_headline', 'review_body', 'star_rating')\
        .filter(full_df.star_rating != 3)\
        .withColumn('star_rating_filtered', good_bad('star_rating'))

    # Concatenates the review_headline and review_body columns and renames the column 'text'
    two_col_df = filtered_df.select(
        concat(col('review_headline'), lit(' '), col('review_body')).alias('text'),
        filtered_df.star_rating_filtered)

    # Turns string into a list of words with the punctuation removed
    text_list_df = two_col_df.withColumn('text_list', process(two_col_df['text']))\
        .select('text_list', 'star_rating_filtered')

    # Fitting and transforming the dataset into a count vectorized form
    cv = CountVectorizer(inputCol="text_list", outputCol="count_vec", minDF=1000)
    cv_fit = cv.fit(text_list_df)  # need to save vocabulary from this
    cv_transform = cv_fit.transform(text_list_df)

    # Creates output dataframe, and filters out all reviews that had an error with the star rating
    output_df = cv_transform.select(cv_transform.count_vec, cv_transform.star_rating_filtered)\
        .filter(cv_transform.star_rating_filtered != 2)

    # Saves the vocabulary and processed dataframe to S3 in JSON format
    vocab = spark.createDataFrame(cv_fit.vocabulary, schema=StringType())
    vocab.coalesce(1).write.mode("overwrite").json('s3://dsi-amazon-neural/complete_vocab_newest')
    output_df = output_df.repartition(1000)
    output_df.write.mode("overwrite").json('s3://dsi-amazon-neural/complete_data_newest')

    # 138826230 positives recorded
    print('*' * 50, '\n' * 5, "positive reviews:",
          output_df.filter(output_df.star_rating_filtered == 1).count(), '*' * 50)
    print('*' * 50, '\n' * 5, "negative reviews:",
          output_df.filter(output_df.star_rating_filtered == 0).count(), '*' * 50)
            'Error running command: %s. Return code: %d, Error: %s' % (
                ' '.join(args_list), proc.returncode, errors))
    return (output, errors)


'''
------------------------------------------------------
Initialize Spark Context, Google Translator and LOGGER
------------------------------------------------------
'''
translator = Translator()
conf = SparkConf().setAppName("spark-test").setMaster("local")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.config('spark.executor.memory', '1g').getOrCreate()
LOG4JLOGGER = spark.sparkContext._jvm.org.apache.log4j
LOGGER = LOG4JLOGGER.LogManager.getLogger(__name__)
LOGGER.info(sc.getConf().getAll())

'''
---------------------------------------
LOAD CONFIG FILE
---------------------------------------
'''
LOGGER.info("Following config passed to the Driver :")
# ANUVAAD_INPUT_CONFIG = yaml.safe_load(open('config.yml'))

'''
---------------------------------------
Get config values from yml file
---------------------------------------
'''
# sc.binaryFiles()
# sc.binaryRecords()
# sc.cancelAllJobs()
# sc.cancelJobGroup(groupId)
# sc.setJobGroup(groupId, "")

# Dump profiling summaries into the given directory path
# sc.dump_profiles(path)

rdd = sc.emptyRDD()  # Create an RDD that has no partitions or elements.

print(sc.getConf())  # Returns the SparkConf object

# getLocalProperty(key)
# Get a local property set in this thread, or null if it is missing. See setLocalProperty().

# classmethod getOrCreate(conf=None)

"""
sc.hadoopFile()
sc.hadoopRDD()

sheet = sc.newAPIHadoopFile(
    '/user/me/sample.txt',
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text',
    conf={'textinputformat.record.delimiter': 'Time\tMHist'}
)
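The comments above mention `getOrCreate(conf=None)` and the local-property accessors; a minimal standalone sketch of those calls (independent of the snippet's own `sc`, the property key is just an example):

from pyspark import SparkConf, SparkContext

# getOrCreate() returns the active SparkContext or builds one from the conf.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[1]").setAppName("demo"))

# Local properties are scoped to the current thread.
sc.setLocalProperty("callSite.short", "demo-job")
print(sc.getLocalProperty("callSite.short"))  # -> 'demo-job'
sc.stop()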
def main(sc: SparkContext,
         input_path: str,
         output_path: str,
         is_parquet: bool = False,
         from_local: bool = False,
         to_hdfs: bool = False,
         is_sequence: bool = False,
         is_gzip: bool = False) -> None:
    if not from_local:
        input_path = os.path.join(Config.get_hdfs_basepath(), input_path)
    log.info(f'Loading data from: {input_path}')

    # Stage 1: Read user dataset
    # step 1: read files from hdfs
    user_path = os.path.join(input_path, 'users/*-r-00000')
    users = sc.wholeTextFiles(user_path) \
        .map(lambda x: (os.path.basename(x[0]), x[1])) \
        .filter(lambda x: x[0][0].isdigit())

    # step 2: flatten data with user_id
    # format ('{user_id}-r-0000', 'movie_id,rating,date\n...')
    #     -> (user_id, [movie_id, rating, date])
    users = flatten_user_data(users)

    # used for the sparse vector
    max_user_id = users \
        .max(key=lambda x: x[0])[0]

    # Stage 2: Read movie data and flatten
    # movies = sc.textFile(os.path.join(input_path, 'movies/*'))
    # movies = flatten_movie_data(movies)

    # Stage 2: Build sparse vector of ratings
    movies = build_sparse_vector_with_ratings(users, max_user_id)

    # Stage 3: Build all combinations of movie pairs per user
    # format: [(movie1, movie2),
    #          (SparseVector(movie1_rate), SparseVector(movie2_rate), num_rates1, num_rates2)]
    movie_pairs = build_item_pairs_by_key(movies)

    # Stage 4: Calculate correlations
    # format: [movie1, movie2, cosine]
    # TODO: add Pearson
    output = calc_similarity(movie_pairs)

    # Stage 6: Output the scores to a file in HDFS/Parquet
    if to_hdfs:
        output_path = os.path.join(Config.get_hdfs_basepath(), output_path)

    if is_parquet:
        from pyspark.sql import SparkSession
        from pyspark.sql.types import StructType
        from pyspark.sql.types import StructField
        from pyspark.sql.types import LongType, FloatType

        # TODO: add Pearson and Jaccard
        schema = StructType([
            StructField('Movie1', LongType(), False),
            StructField('Movie2', LongType(), False),
            StructField('Cosine', FloatType(), True),
            # StructField('Pearson', FloatType(), True),
            StructField('Jaccard-bin', FloatType(), True),
            StructField('Jaccard', FloatType(), True)
        ])
        session = SparkSession.builder.config(conf=sc.getConf()) \
            .getOrCreate()
        output = output \
            .map(lambda x: [*x[0], *map(float, x[1])])
        output_path = os.path.join(output_path, 'result.parquet')
        session.createDataFrame(output, schema=schema) \
            .write \
            .mode('overwrite') \
            .parquet(output_path)
    else:
        # write as simple text with Writable from Hadoop API
        labels = ['Cosine', 'Jaccard-bin', 'Jaccard']
        if is_gzip:
            compClass = 'org.apache.hadoop.io.compress.GzipCodec'
        else:
            compClass = None
        output = output.map(lambda x: (convert_to_writable_format(x[0]),
                                       convert_to_writable_format(x[1], labels)))
        if to_hdfs:
            if is_sequence:
                output.saveAsNewAPIHadoopFile(
                    output_path,
                    outputFormatClass='org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat',
                    keyClass='org.apache.hadoop.io.Text',
                    valueClass='org.apache.hadoop.io.Text')
            else:
                output.saveAsTextFile(output_path,
                                      compressionCodecClass=compClass)
        else:
            output.saveAsTextFile(output_path,
                                  compressionCodecClass=compClass)

    log.info(f'Saved at: {output_path}')
class EpidataContext:
    """
    A connection to the epidata server, and all relevant context.
    """

    def __init__(self):
        spark_conf = SparkConf()
        self._sc = SparkContext(os.environ['SPARK_MASTER'], 'epidata',
                                conf=spark_conf)

        # get epidata spark conf
        conf = self._sc.getConf()
        cassandra_user = conf.get('spark.cassandra.auth.username', 'cassandra')
        cassandra_pass = conf.get('spark.cassandra.auth.password', 'epidata')
        cassandra_host = conf.get('spark.cassandra.connection.host', '127.0.0.1')
        cassandra_keyspace = conf.get('spark.epidata.cassandraKeyspaceName',
                                      'epidata_development')
        kafka_brokers = conf.get('spark.epidata.kafka.brokers', 'localhost:9092')
        kafka_batch_duration = int(
            conf.get('spark.epidata.kafka.duration', '6'))
        self._measurement_class = conf.get('spark.epidata.measurementClass',
                                           'sensor_measurement')

        java_import(self._sc._jvm, "com.epidata.spark.EpidataContext")
        self._jec = self._sc._jvm.EpidataContext(self._sc._jsc)
        self._sql_ctx = SQLContext(self._sc, self._jec.getSQLContext())
        self._sql_ctx_pyspark = SQLContext(self._sc)
        self._cassandra_conf = {
            'keyspace': cassandra_keyspace,
            'user': cassandra_user,
            'password': cassandra_pass
        }
        self._has_checked_memory = False
        self._kafka_broker = os.environ.get('KAFKA_BROKER', kafka_brokers)
        self._batch_duration = kafka_batch_duration
        self._ssc = StreamingContext(self._sc, self._batch_duration)

    def query_measurements_original(self, field_query, begin_time, end_time):
        """
        Query for epidata measurements.

        Parameters
        ----------
        field_query : dictionary containing either strings or lists of strings
            A dictionary containing field names and the values those fields must
            contain in matching measurements. Some system configurations require
            that values of specific fields be specified. A string field value
            represents an equality match, while a list value represents set
            membership (all values within the set are matched).
        begin_time : datetime
            Beginning of the time interval to query, inclusive.
        end_time : datetime
            End of the time interval to query, exclusive.

        Returns
        -------
        result : epidata DataFrame
            A DataFrame containing measurements matching the query.
""" self._check_cluster_memory() java_field_query, java_begin_time, java_end_time = self._to_java_params( field_query, begin_time, end_time) java_data_frame = self._jec.query(java_field_query, java_begin_time, java_end_time) return DataFrame(jdf=java_data_frame, sql_ctx=self._sql_ctx) def query_measurements_cleansed(self, field_query, begin_time, end_time): self._check_cluster_memory() java_field_query, java_begin_time, java_end_time = self._to_java_params( field_query, begin_time, end_time) java_data_frame = self._jec.queryMeasurementCleansed( java_field_query, java_begin_time, java_end_time) return DataFrame(jdf=java_data_frame, sql_ctx=self._sql_ctx) def query_measurements_summary(self, field_query, begin_time, end_time): self._check_cluster_memory() java_field_query, java_begin_time, java_end_time = self._to_java_params( field_query, begin_time, end_time) java_data_frame = self._jec.queryMeasurementSummary( java_field_query, java_begin_time, java_end_time) return DataFrame(jdf=java_data_frame, sql_ctx=self._sql_ctx) def create_stream(self, ops, original="measurements", clean_up=True): esc = EpidataStreamingContext(self._sc, self._ssc, self._sql_ctx_pyspark, original, self._kafka_broker, self._cassandra_conf, self._measurement_class) esc.run_stream(ops, clean_up) def start_streaming(self): def _start(): self._ssc.start() self._ssc.awaitTermination() thread = Thread(target=_start) thread.start() def stop_streaming(self): self._ssc.stop(False, True) self._ssc = StreamingContext(self._sc, self._batch_duration) def create_transformation(self, func, args=[], destination="measurements_cleansed"): cassandra_tables = [ 'measurements', 'measurements_original', 'measurements_raw', 'measurements_cleansed', 'measurements_processed', 'measurements_summary', 'measurements_aggregates' ] datastore = "cassandra" if destination in cassandra_tables else "kafka" return Transformation(func, args, destination, datastore) def list_keys(self): """ List the epidata measurement keys. Returns ------- result : epidata DataFrame A DataFrame containing values of the principal fields used for classifying measurements. """ self._check_cluster_memory() return DataFrame(jdf=self._jec.listKeys(), sql_ctx=self._sql_ctx) def _check_cluster_memory(self): if self._has_checked_memory: return try: spark_ip = re.match('spark://(.*):\d+', os.environ['SPARK_MASTER']).group(1) clusterStatus = json.loads( urllib.urlopen('http://' + spark_ip + ':18080/json').read()) if clusterStatus['memory'] - clusterStatus['memoryused'] < 3 * 512: raise MemoryError('All cluster memory resources are in use.') except MemoryError: raise except Exception as e: print e pass self._has_checked_memory = True def _to_java_params(self, field_query, begin_time, end_time): gc = self._sc._gateway._gateway_client def to_java_list(x): if isinstance(x, basestring): return ListConverter().convert([x], gc) return ListConverter().convert(x, gc) java_list_field_query = { k: to_java_list(v) for k, v in field_query.items() } java_field_query = MapConverter().convert(java_list_field_query, gc) java_begin_time = self._to_java_timestamp(begin_time) java_end_time = self._to_java_timestamp(end_time) return java_field_query, java_begin_time, java_end_time def _to_java_timestamp(self, dt): ts = long(time.mktime(dt.timetuple()) * 1e3 + dt.microsecond / 1e3) return self._sc._jvm.java.sql.Timestamp(ts)
def main():
    sc = SparkContext('local[15]', 'haha')
    # sc._conf.set("spark.python.profile", "true")
    print(sc.getConf().getAll())

    d = load(sc)
    data_train_lp, data_dev_p, label_dev_gt, test_p = (
        d['train_tfidf_lp'], d['dev_tfidf'], d['dev_gt'], d['test_tfidf'])
    data_train_p, label_train_gt = d['train_tfidf'], d['train_gt']
    data_train, data_dev, data_test = d['train_raw'], d['dev_raw'], d['test_raw']
    data_train_lp = data_train_lp.sample(False, 0.01)
    # print(sum(data_train_lp.first()[0]))
    # print(data_train_lp.zipWithIndex().collect())
    print(data_train_lp.take(2))

    print("___________train_bayes_____________")
    sys.stdout.flush()
    nb = NaiveBayes.train(data_train_lp)
    print("___________trained_bayes___________")
    sys.stdout.flush()
    # nb.save(sc, 'bayes.model')
    bayes_result_dev = nb.predict(data_dev_p).map(int)
    bayes_result_dev.count()
    bayes_result_train = nb.predict(data_train_p).map(int)
    bayes_result_train.count()
    bayes_result_test = nb.predict(test_p).map(int)
    bayes_result_test.count()
    print("train info:")
    valid(bayes_result_train, label_train_gt)
    print("dev info:")
    valid(bayes_result_dev, label_dev_gt)

    print("___________train_logistic_____________")
    sys.stdout.flush()
    lg = LogisticRegressionWithSGD.train(data_train_lp, step=0.005)
    print("___________trained_logistic___________")
    sys.stdout.flush()
    # lg.save(sc, 'logistic.model')
    logistic_result_dev = lg.predict(data_dev_p).map(int)
    logistic_result_train = lg.predict(data_train_p).map(int)
    logistic_result_test = lg.predict(test_p).map(int)
    print("train info:")
    valid(logistic_result_train, label_train_gt)
    print("dev info:")
    valid(logistic_result_dev, label_dev_gt)

    fused_train_p = stack_label([bayes_result_train, logistic_result_train])
    fused_dev_p = stack_label([bayes_result_dev, logistic_result_dev])
    fused_test_p = stack_label([bayes_result_test, logistic_result_test])
    fused_train_lp = label(data_train, fused_train_p)

    print("___________train_GBDT___________")
    sys.stdout.flush()
    gbdt = GradientBoostedTrees.trainClassifier(fused_train_lp, {})
    print('___________trained_GBDT_________')
    sys.stdout.flush()
    fused_result_train = gbdt.predict(fused_train_p)
    fused_result_dev = gbdt.predict(fused_dev_p)
    fused_result_test = gbdt.predict(fused_test_p)
    print("train info:")
    valid(fused_result_train, label_train_gt)
    print("dev info:")
    valid(fused_result_dev, label_dev_gt)

    dump(fused_result_test.map(int).collect())
    sc.show_profiles()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

conf = SparkConf().setAppName("HelloPython").setMaster("local")
sc = SparkContext(conf=conf)
print(sc.getConf().getAll())
sc.stop()

spark = SparkSession.builder.appName("spark").getOrCreate()
print(spark.sparkContext.getConf().getAll())
spark.sparkContext.stop()
conf = (SparkConf()
        .setMaster("yarn")
        .setAppName("Stack Overflow Test")
        .set("spark.executor.memory", "2g"))
sc = SparkContext(conf=conf)

from pyspark import SparkContext
SparkContext.setSystemProperty('spark.executor.memory', '2g')

from pyspark import SparkContext
sc = SparkContext("yarn", "StackOverflowTest",
                  pyFiles=['sotest.py', 'lib.zip'])

# Run in terminal
# spark2-submit --executor-memory 4g stackoverflowtest.py

sc.getConf().getAll()
[(u'spark.driver.host', u'10.0.2.104'),
 (u'spark.eventLog.enabled', u'true'),
 (u'spark.ui.proxyBase', u'/proxy/application_1511794877761_0014'),
 (u'spark.driver.extraLibraryPath', u'/opt/cloudera/parcels/CDH-5.12.1-1.cdh5.12.1.p0.3/lib/hadoop/lib/native'),

from pyspark.conf import SparkConf
SparkSession.builder.config(conf=SparkConf())

SparkSession.builder.config('spark.eventLog.dir', '/stackexchange/logs')

spark = SparkSession.builder \
    .master('yarn') \
    .appName('StackOverflowTest') \
    .config('spark.executor.memory', '2g') \
    .getOrCreate()

spark = SparkSession.builder \
    .master('yarn') \
        'C': C,
        'T': T,
        'distances': distances,
        'iterations': iterations
    }


def lowest_mean_distance_reduce(x, y):
    mean_distance_x = np.mean(x['distances'])
    mean_distance_y = np.mean(y['distances'])
    if mean_distance_x > mean_distance_y:
        return y
    else:
        return x


if __name__ == "__main__":
    sc = SparkContext(appName="Spark ICP room finder")
    spark = SparkSession.builder.appName("Spark ICP room finder").getOrCreate()

    num_of_executors = int(sc.getConf().get("spark.executor.instances"))
    int_rdd = sc.parallelize(range(num_of_executors))
    int_rdd.map(lambda x: import_packages(x)).collect()

    A_ply = sys.argv[1]

    room_paths = []
    for filename in os.listdir(sys.argv[2]):
        room_paths.append(filename)

    dist_room_paths = sc.parallelize(room_paths)
    most_probable_room = dist_room_paths.map(run_icp_map).reduce(
        lowest_mean_distance_reduce)
    print(most_probable_room)
def _pr_spark_conf(sc: SparkContext):
    pr_red('Number of Workers: ' + str(sc.defaultParallelism))
    pr_red('Driver Memory: ' + sc.getConf().get("spark.driver.memory"))
    pr_red('Maximum Result Size: ' + sc.getConf().get("spark.driver.maxResultSize"))
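One caveat with the helper above: `SparkConf.get(key)` returns `None` when the key was never set (older PySpark versions raise instead), which breaks the string concatenation. A hedged variant supplies explicit defaults; the fallback values here are illustrative, and `pr_red` is assumed to be the same printing helper used above:

# Defensive variant of the helper above; default values are placeholders.
def _pr_spark_conf_safe(sc: SparkContext):
    conf = sc.getConf()
    pr_red('Number of Workers: ' + str(sc.defaultParallelism))
    pr_red('Driver Memory: ' + conf.get("spark.driver.memory", "1g"))
    pr_red('Maximum Result Size: ' + conf.get("spark.driver.maxResultSize", "1g"))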
import os
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StringType
from pyspark.sql.functions import trim, col, lit, regexp_replace, udf
from functools import reduce
from aut import *
import time
import argparse

conf = SparkConf()
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

WB_PATH = "/dfs/dataset/wb"

print(sc.getConf().getAll())


def timer(task, start, end):
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    time_text = "[{}]{:0>2}:{:0>2}:{:05.2f}\n".format(task, int(hours),
                                                      int(minutes), seconds)
    print_and_log(time_text)


def print_and_log(text):
    print(text)
    write_log(text)
from pyspark import SparkContext, SparkConf
import math
from pyspark.mllib.recommendation import ALS

conf = SparkConf().setAll([('spark.executor.memory', '20g'),
                           ('spark.executor.cores', '4'),
                           ('spark.cores.max', '24'),
                           ('spark.driver.memory', '30g'),
                           ('spark.driver.cores', '4'),
                           ('spark.executor.instances', '4')])
sc = SparkContext(conf=conf)
sc.getConf().getAll()

raw_data = sc.textFile("CHANGEME")
data = raw_data.map(lambda line: line.split(',')).map(
    lambda tokens: (tokens[0], tokens[1], tokens[2])).cache()

# split dataset
training_RDD, validation_RDD, test_RDD = data.randomSplit([6, 2, 2])
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

# training parameters
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.02

# training
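The snippet above stops right at the `# training` marker. A hedged sketch of how the rank-selection loop commonly continues (following the standard MLlib ALS recipe, not necessarily the author's code) reuses the variables defined above:

# Hedged sketch of an ALS rank-selection loop; the selection logic itself
# is an assumption, only the variable names come from the snippet above.
min_error = float('inf')
best_rank = -1
for i, rank in enumerate(ranks):
    model = ALS.train(training_RDD, rank, iterations=iterations,
                      lambda_=regularization_parameter)
    # predictAll expects (user, product) pairs and returns Rating objects
    predictions = model.predictAll(validation_for_predict_RDD).map(
        lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_RDD.map(
        lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(
        rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
    errors[i] = error
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank
print('The best model was trained with rank %s' % best_rank)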
def __init__(self, spark_context: SparkContext):
    conf: SparkConf = spark_context.getConf()
    log4j = spark_context._jvm.org.apache.log4j
    message_prefix = f"<{conf.get('spark.app.id')} {conf.get('spark.app.name')}>"
    self._log4j_logger = log4j.LogManager.getLogger(message_prefix)