Example #1
def word_count_repartition_n(internal_param, data_file):
    try:

        conf = SparkConf().setMaster("spark://dana:7077").setAppName(
            internal_param[1]).setAll([
                ('spark.driver.cores', internal_param[2]),
                ('spark.driver.memory', internal_param[3]),
                ('spark.executor.instances', internal_param[4]),
                ('spark.executor.memory', internal_param[5]),
                ('spark.executor.cores', internal_param[6])
            ])
        sc = SparkContext(conf=conf,
                          pyFiles=[
                              'run_app.py', 'repartition_scripts.py',
                              'wordCountConfig.py'
                          ])

        data = sc.textFile(data_file)

        repartitionData = data.repartition(
            int(
                int(sc.getConf().get("spark.executor.cores")) *
                int(sc.getConf().get("spark.executor.instances")) *
                float(internal_param[7])))

        words = repartitionData.flatMap(mymapeo)

        print(str(int(data.getNumPartitions())))
        print(str(int(words.getNumPartitions())))

        print('NUM WORDS PARTITIONS ' + str(
            int(
                int(sc.getConf().get("spark.executor.cores")) *
                int(sc.getConf().get("spark.executor.instances")) *
                float(internal_param[7]))))

        frequencies = words.filter(lambda x: x != '').map(
            lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
        print('Words frequencies:', frequencies.take(5))

        print('NUM FREQUENCIES PARTITIONS ' +
              str(int(frequencies.getNumPartitions())))

        app_id = sc.applicationId

        sc.stop()
        return app_id
    except Exception:
        print("Configuration error: " + str(internal_param))
        sc.stop()
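The repartition count above is computed inline from spark.executor.cores, spark.executor.instances and a scaling factor in three separate places; a small helper along these lines (a sketch, the name target_partitions is made up) would keep that logic in one spot:

def target_partitions(sc, scale_factor):
    # Derive a partition count from the executor topology. Assumes
    # spark.executor.cores and spark.executor.instances were set explicitly
    # on the SparkConf, as in the example above.
    cores = int(sc.getConf().get("spark.executor.cores"))
    instances = int(sc.getConf().get("spark.executor.instances"))
    return int(cores * instances * float(scale_factor))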
Example #2
def spark_session(request):
    conf = SparkConf()
    conf.set("spark.jars",
             "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector"
             "-hadoop2-2.0.1.jar")
    conf.set("spark.jars.packages",
             "com.google.cloud.spark:spark-bigquery-with-dependencies_2.11:0.13.1-beta")
    sc = SparkContext(conf=conf)

    sc._jsc.hadoopConfiguration().set("fs.gs.impl",
                                      "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    sc._jsc.hadoopConfiguration().set("fs.AbstractFileSystem.gs.impl",
                                      "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    sc._jsc.hadoopConfiguration().set(
        "google.cloud.auth.service.account.enable", "true")

    sa_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    if sa_path is not None:
        sc._jsc.hadoopConfiguration().set(
            "google.cloud.auth.service.account.json.keyfile",
            sa_path)

    spark = SparkSession.builder \
        .config(conf=sc.getConf()) \
        .getOrCreate()

    request.addfinalizer(lambda: spark.stop())

    return spark
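spark_session follows the pytest fixture pattern (request.addfinalizer registers the teardown); a minimal usage sketch, assuming the function is registered as a fixture and the listed jars are reachable:

import pytest

@pytest.fixture(scope="session")
def spark(request):
    # Delegate to the spark_session helper above.
    return spark_session(request)

def test_session_is_usable(spark):
    # Smoke test: the session can run a trivial local job.
    assert spark.range(10).count() == 10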
Example #3
def main_stats():
    spark_context = SparkContext(CLUSTER, APP_NAME, pyFiles=[__file__])
    num_executor = int(spark_context.getConf().get("spark.executor.instances"))
    world_cities_file = spark_context.textFile("hdfs://" + sys.argv[1],
                                               minPartitions=num_executor)
    tuplified_cities = tuplify_city(world_cities_file)
    print(cities_stats(tuplified_cities))
Example #4
def spark(request):
    conf = SparkConf()
    conf.set(
        'spark.jars',
        'https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector'
        '-hadoop2-2.0.1.jar')
    conf.set(
        'spark.jars.packages',
        'com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.19.1')
    conf.set('spark.driver.host', '127.0.0.1')
    sc = SparkContext(master='local', conf=conf)

    sc._jsc.hadoopConfiguration().set(
        'fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
    sc._jsc.hadoopConfiguration().set(
        'fs.AbstractFileSystem.gs.impl',
        'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS')
    sc._jsc.hadoopConfiguration().set(
        'google.cloud.auth.service.account.enable', 'true')

    sa_path = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
    if sa_path is not None:
        sc._jsc.hadoopConfiguration().set(
            'google.cloud.auth.service.account.json.keyfile', sa_path)

    spark = SparkSession.builder \
        .config(conf=sc.getConf()) \
        .getOrCreate()

    request.addfinalizer(lambda: spark.stop())

    return spark
Example #5
def main_hist():
    spark_context = SparkContext(CLUSTER, APP_NAME, pyFiles=[__file__])
    num_executor = int(spark_context.getConf().get("spark.executor.instances"))
    world_cities_file = spark_context.textFile("hdfs://" + sys.argv[1],
                                               minPartitions=num_executor)
    histogram_rdd = histogram(world_cities_file)
    for line in histogram_rdd.take(10):
        print(line)
Example #6
def main_tuple_2():
    spark_context = SparkContext(CLUSTER, APP_NAME, pyFiles=[__file__])
    num_executor = int(spark_context.getConf().get("spark.executor.instances"))
    world_cities_file = spark_context.textFile("hdfs://" + sys.argv[1],
                                               minPartitions=num_executor)
    tuple_2 = tuplify_city(world_cities_file)
    for line in tuple_2.take(10):
        print(line)
Example #7
    def test_create_spark_context_first_then_spark_session(self):
        sc = None
        session = None
        try:
            conf = SparkConf().set("key1", "value1")
            sc = SparkContext('local[4]', "SessionBuilderTests", conf=conf)
            session = SparkSession.builder.config("key2", "value2").getOrCreate()

            self.assertEqual(session.conf.get("key1"), "value1")
            self.assertEqual(session.conf.get("key2"), "value2")
            self.assertEqual(session.sparkContext, sc)

            self.assertFalse(sc.getConf().contains("key2"))
            self.assertEqual(sc.getConf().get("key1"), "value1")
        finally:
            if session is not None:
                session.stop()
            if sc is not None:
                sc.stop()
Example #8
    def test_create_spark_context_first_then_spark_session(self):
        sc = None
        session = None
        try:
            conf = SparkConf().set("key1", "value1")
            sc = SparkContext("local[4]", "SessionBuilderTests", conf=conf)
            session = SparkSession.builder.config("key2", "value2").getOrCreate()

            self.assertEqual(session.conf.get("key1"), "value1")
            self.assertEqual(session.conf.get("key2"), "value2")
            self.assertEqual(session.sparkContext, sc)

            self.assertFalse(sc.getConf().contains("key2"))
            self.assertEqual(sc.getConf().get("key1"), "value1")
        finally:
            if session is not None:
                session.stop()
            if sc is not None:
                sc.stop()
Example #9
def main_join():
    sc = SparkContext(CLUSTER, APP_NAME, pyFiles=[__file__])
    num_executor = int(sc.getConf().get("spark.executor.instances"))
    world_cities_file = sc.textFile("hdfs://" + sys.argv[1],
                                    minPartitions=num_executor)
    region_codes_file = sc.textFile("hdfs://" + sys.argv[2],
                                    minPartitions=num_executor)
    joined_rdd = join(world_cities_file, region_codes_file)
    for line in joined_rdd.take(10):
        print(line)
Example #10
def spark_conf():
    """
    Create and initialize the Spark objects.
    :return: tuple of (SparkSession, SparkContext, SQLContext)
    """
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    sqlc = SQLContext(sparkContext=sc)
    spark = SparkSession.builder.config(conf=sc.getConf()).enableHiveSupport().getOrCreate()

    return spark, sc, sqlc
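A brief usage sketch for spark_conf(); the SHOW DATABASES call assumes a reachable Hive metastore, since enableHiveSupport() is used:

spark, sc, sqlc = spark_conf()
print(sc.getConf().get("spark.app.name", "unset"))  # inspect the effective config
spark.sql("SHOW DATABASES").show()
spark.stop()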
Example #11
def word_count_sort_repartition_n(internal_param, data_file):
    try:
        conf = SparkConf().setMaster("spark://dana:7077").setAppName(
            internal_param[1]).setAll([
                ('spark.driver.cores', internal_param[2]),
                ('spark.driver.memory', internal_param[3]),
                ('spark.executor.instances', internal_param[4]),
                ('spark.executor.memory', internal_param[5]),
                ('spark.executor.cores', internal_param[6])
            ])
        sc = SparkContext(conf=conf,
                          pyFiles=[
                              'run_app_small.py', 'run_app.py',
                              'sesgo_scripts.py', 'persist_scripts.py',
                              'repartition_scripts.py', 'config_scripts.py',
                              'wordCountConfig.py'
                          ])

        data = sc.textFile(data_file)
        repartitionData = data.repartition(
            int(
                int(sc.getConf().get("spark.executor.cores")) *
                int(sc.getConf().get("spark.executor.instances")) *
                float(internal_param[7])))

        words = repartitionData.flatMap(mymapeo)

        print(str(int(data.getNumPartitions())))
        print(str(int(words.getNumPartitions())))

        print('NUM WORDS PARTITIONS ' + str(
            int(
                int(sc.getConf().get("spark.executor.cores")) *
                int(sc.getConf().get("spark.executor.instances")) *
                float(internal_param[7]))))

        frequencies = words.filter(lambda x: x != '').map(
            lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)

        repartfrequencies = frequencies.repartition(
            int(
                int(sc.getConf().get("spark.executor.cores")) *
                int(sc.getConf().get("spark.executor.instances")) *
                float(internal_param[7])))

        numWords = data.count()
        sortFreq = frequencies.sortBy(lambda x: x[1], ascending=False)
        topFreqs = sortFreq.take(5)

        print('Number of words: ', numWords)
        print('Words frequencies:', sortFreq.take(5))
        print('Top 5 frequencies:', topFreqs)

        app_id = sc.applicationId
        sc.stop()
        return app_id
    except Exception:
        print("Configuration error: " + str(internal_param))
        sc.stop()
def word_count_sort_pesist_disk_only(internal_param, data_file):
    try:
        conf = SparkConf().setMaster("spark://dana:7077").setAppName(
            internal_param[1]).setAll([
                ('spark.driver.cores', internal_param[2]),
                ('spark.driver.memory', internal_param[3]),
                ('spark.executor.instances', internal_param[4]),
                ('spark.executor.memory', internal_param[5]),
                ('spark.executor.cores', internal_param[6])
            ])

        sc = SparkContext(conf=conf,
                          pyFiles=['run_app.py', 'config_scriptsDf.py'])
        spark = SparkSession.builder.config(conf=conf).getOrCreate()

        data = sc.textFile(data_file).flatMap(lambda x: x.split(" ")).collect()
        paralData = sc.parallelize(data, 100)
        print(paralData.getNumPartitions())
        print(sc.getConf().get("spark.executor.instances"))

        df = paralData.map(lambda r: Row(r)).toDF(["word"])
        df.show()

        cleanDf = df.filter(col('word') != '').withColumn(
            'word', regexp_replace(col('word'), r'[^\sa-zA-Z0-9]', ''))

        result = cleanDf.withColumn(
            'count', lit(1)).groupBy('word').sum('count').withColumnRenamed(
                'sum(count)', 'frequencies')
        result.show()

        numWords = len(data)  # data was collected to a local list above
        sortFreq = result.orderBy(col('frequencies').desc())
        topFreqs = sortFreq.take(5)

        print('Number of words: ', numWords)
        print('Words frequencies:', sortFreq.collect())
        print('Top 5 frequencies:', topFreqs)

        app_id = sc.applicationId
        sc.stop()
        return app_id
    except Exception:
        print("Configuration error: " + str(internal_param))
        sc.stop()
Example #13
def build_spark_session(
        app_name: str,
        spark_config: DefaultDict[str, str] = None,
        hadoop_config: DefaultDict[str, str] = None) -> SparkSession:
    conf = SparkConf()
    if spark_config:
        conf.setAll(spark_config.items())

    sc = SparkContext(conf=conf)

    if hadoop_config:
        for k, v in hadoop_config.items():
            sc._jsc.hadoopConfiguration().set(k, v)

    return SparkSession.builder \
        .appName(app_name) \
        .config(conf=sc.getConf()) \
        .getOrCreate()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--review-file", help="The input file of reviews")
    parser.add_argument("--movie-file", help="The input file of movies")
    parser.add_argument("--task-2", action="store_true", help="For task 2")
    parser.add_argument("--K", "-k", help="Number of top K words")
    parser.add_argument("--t")
    parser.add_argument("--tuning", action="store_true")
    args = parser.parse_args()

    review_file = args.review_file
    movie_file = args.movie_file
    is_task_2 = args.task_2
    k = args.K or 10
    k = int(k)
    t = args.t
    tuning = args.tuning

    threads = []
    if tuning:
        threads = [1, 2, 3, 5, 8, 13]
    else:
        threads = [t if t else 1]

    for thread in threads:
        master = 'local[{}]'.format(thread)

        conf = SparkConf().setMaster(master).setAppName("MovieRanking")
        spark_context = SparkContext(conf=conf)

        print('spark conf: {}'.format(str(spark_context.getConf().getAll())))

        time_start = datetime.datetime.now()
        movies_tuple = generate_rdds(movie_file, review_file, spark_context)

        if is_task_2:
            complete_task_2(movies_tuple)
        else:
            complete_task_1(movies_tuple, k)

        time_end = datetime.datetime.now()
        print('threads:{}, duration: {}'.format(thread, time_end - time_start))
        # SparkContext.stop(spark_context)
        spark_context.stop()
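A usage sketch for build_spark_session defined at the top of this example; all configuration values below are illustrative, not taken from the original:

spark = build_spark_session(
    app_name="example-app",
    spark_config={"spark.executor.memory": "2g",
                  "spark.sql.shuffle.partitions": "64"},
    hadoop_config={"fs.s3a.connection.maximum": "64"})
print(spark.sparkContext.getConf().get("spark.executor.memory"))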
Example #15
def main():
  
  #parse command line options
  (options,args)=parseOptions()
  
  if len(args) != 2:
    raise Exception("need an input file and an output path")
  
  #create the Spark context first so sc.defaultParallelism is available below
  conf=SparkConf().setAppName("wordCount").setMaster("local[*]")
  sc = SparkContext(conf=conf)
  
  #set number of file partitions/parallelism
  if options.numPartitions==None:
    #pick number of partitions based on the default amount of parallelism,
    #which is related to the number of cores on the machine
    partFactor=1 #how many times the default parallelism
    numPartitions=sc.defaultParallelism*partFactor
  else:
    numPartitions=options.numPartitions
  
  conf=sc.getConf()
  print("conf="+str(conf.getAll()))
  print("defaultMinPartitions="+str(sc.defaultMinPartitions))
  print("defaultParallelism="+str(sc.defaultParallelism))
  
  inputFileName = args[0]
  outputFileName= args[1]
  
  timeStart=time.time()
  file = sc.textFile(inputFileName,minPartitions=numPartitions)
  counts = file.count()
  timeEnd=time.time()
  dtRead=timeEnd-timeStart#time in seconds
  
  #write out to a file
  timeStart=time.time()
  file.saveAsTextFile(outputFileName)
  timeEnd=time.time()
  dtWrite=timeEnd-timeStart#time in seconds
  
  print("read+count time="+str(dtRead)+" s")
  print("write time="+str(dtWrite)+" s")
  print("number of lines="+str(counts))
  print("num Partitions="+str(file.getNumPartitions()))
Example #16
def main():
    database = "/home/manuelr/ths-client/app/app.db"
    conn = create_connection(database)
    sc = SparkContext(appName='Insert Tweets')
    spark = get_spark_session_instance(sc.getConf())
    cur = conn.cursor()

    result = cur.execute(
        ''' SELECT MAX(date_modified), date_created FROM tweets ''').fetchone(
        )

    if result[1] is None:
        max_date = '2018-10-01 00:00:00.000'
    else:
        max_date = result[1]

    spark.sql('use thsfulltext')
    df = spark.sql('select twitter_id, full_text, inserted_tweet from tweet')
    df = df.filter(
        df.inserted_tweet.between(str(max_date), str(
            datetime.today()))).orderBy(df.inserted_tweet.asc())
    tweets = df.collect()
    count = len(tweets)
    sql_select = ''' SELECT tweet_id FROM tweets WHERE tweet_id = ?'''
    limit = 0
    index = 0
    with conn:
        while limit < 5000:

            while index < count and cur.execute(
                    sql_select,
                [str(tweets[index].twitter_id)]).fetchone() is not None:

                index = index + 1
                print('tweet already inserted')

            if index >= count:
                print('There are no more tweets to insert')
                break
            insert_tweet(conn, tweets[index])

            limit = limit + 1
            index = index + 1
Example #17
    .master("yarn") \
    .appName("TimingLimit") \
    .getOrCreate()

order_info = spark.table('dev.dev_lgb_fullStock_TimingLimit_order_info')
waybill_info = spark.table('dev.dev_lgb_fullStock_TimingLimit_waybill_info')

# dev_lgb_fullStock_TimingLimit_order_info
# dev_lgb_fullStock_TimingLimit_waybill_info


# Part 2: SparkSession / RDD section
def rmDirFiles(rm_path):
    '''
    saveAsTextFile cannot overwrite an existing directory, so remove it first.
    '''
    rm_cmd = 'hadoop fs -rm -r {0}'.format(rm_path)
    try:
        os.system(rm_cmd)
    except Exception:
        print('[ {0} ] the path has already been removed !'.format(rm_cmd))


save_path = r'hdfs://ns15/user/cmo_ipc/longguangbin/work/pytest'

sc = SparkContext(master="yarn", appName="My App")
sc_conf = map(lambda x: x[0] + ':' + x[1], sc.getConf().getAll())
rmDirFiles(save_path + os.sep + 'sc_conf')
sc.parallelize(sc_conf).repartition(1).saveAsTextFile(save_path + os.sep +
                                                      'sc_conf')
from pyspark.sql import SparkSession

from pyspark.sql.functions import udf
from pyspark.sql.types import *

sc = SparkContext()
spark = SparkSession.builder.appName(
    "Python Spark SQL basic example").getOrCreate()

# Define the address of the PMI server and the number of MPI workers

hostname = os.uname()[1]
hydra_proxy_port = os.getenv("HYDRA_PROXY_PORT")
pmi_port = hostname + ":" + hydra_proxy_port

sc.getConf().set('mpi', pmi_port)
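# Note: SparkContext.getConf() returns a copy of the configuration, so the line
# above does not change the running context. To make 'mpi' visible at runtime it
# would have to be set on a SparkConf before the SparkContext is created, e.g.
#   conf = SparkConf().set('mpi', pmi_port); sc = SparkContext(conf=conf)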

targetPartition = 4

sizeX = 100
sizeY = 216
sizeZ = 261
sliceSize = sizeX * sizeY

maxIndex = 100 * 216 * 261


def getPartition(value):
    count = 0
    threshold = 0
    for z in zSizes:
Example #19
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from utils import *
from pyspark import SparkContext, SparkConf

file_path = "./data.csv"
checkpoint_dir = "./CheckpointDir/"
conf = SparkConf().setAppName("Car Price Prediction").setMaster("local[*]")
sc = SparkContext(conf=conf)
print(sc.getConf().getAll())
sc.setCheckpointDir(checkpoint_dir)
spark = SQLContext(sc)

data = spark.read.csv(path=file_path,
                      header=True,
                      quote='"',
                      sep=",",
                      inferSchema=True)
data_test, data_train = data.randomSplit(weights=[0.3, 0.7], seed=10)

get_indexer_input = get_indexer_input(data)


def model_training(data_train, indexer_input):
    x_cols = list(
        set(data_train.columns) - set(indexer_input.keys() + ["Price"]))
    str_ind_cols = ['indexed_' + column for column in indexer_input.keys()]
    indexers = indexer_input.values()
    pipeline_tr = Pipeline(stages=indexers)
    data_tr = pipeline_tr.fit(data_train).transform(data_train)
def main():
	sc = SparkContext()
	spark = SparkSession.builder.appName("SimpleApp").getOrCreate()
	print('*'*60, '\n', sc.getConf().getAll(), '\n', '*'*60, '\n') #Prints configuration at start of run on EMR
		
	strip_chars = ".?,!;:\"/>\\'()#&"
	rgx = sc.broadcast(re.compile('[%s]' % strip_chars))

	def process_str(row):
		"""
		Input: text row from dataframe
		Output: list of words with punctuation removed

		Note that this must be declared in main for proper function
		"""
		body_list = []
		try:
			for word in row.lower().split(): 
				word = rgx.value.sub('', word)  
				body_list.append(word)
			return body_list
		except Exception as e:
			print(e)
			print(row)
			return ['']

	#Declaration of 'user defined function' so nodes can use them
	process = udf(process_str, ArrayType(StringType()))
	good_bad = udf(good_bad_filter, IntegerType())
	

	#Directory of reviews: s3://amazon-reviews-pds/tsv/
	#The use of wildcards (*_us_*.gz) allows spark to load all but the non-english reviews
	full_df = spark.read.csv('s3://amazon-reviews-pds/tsv/*_us_*.gz', sep="\t", header=True, inferSchema=True)
	#full_df_in = spark.read.csv('s3://amazon-reviews-pds/tsv/amazon_reviews_us_Video_v1_00.tsv.gz', sep="\t", header=True, inferSchema=True)
	
	#Repartitioning the Dataframe allows each task to be split to the workers
	repartition_num = 1000
	full_df = full_df.repartition(repartition_num)

	#Filters out 3 star ratings, and only keeps the review_headline, review_body, and star_rating columns
	#The good_bad function makes 4 and above become 1 (positive review), and 2 and below become 0 (negative review)
	filtered_df = full_df.select('review_headline', 'review_body', 'star_rating')\
	   .filter(full_df.star_rating != 3)\
			.withColumn('star_rating_filtered', good_bad('star_rating'))

	#Concatenates the review_headline and review_body columns and renames the column 'text'
	two_col_df = filtered_df.select(concat(col('review_headline'), lit(' '), col('review_body')).alias('text'), filtered_df.star_rating_filtered)

	#Turns string into a list of words with the punctuation removed
	text_list_df = two_col_df.withColumn('text_list', process(two_col_df['text']))\
		.select('text_list', 'star_rating_filtered')

	#Fitting and transforming the dataset into a count vectorized form
	cv = CountVectorizer(inputCol="text_list", outputCol="count_vec", minDF=1000)
	cv_fit = cv.fit(text_list_df) #need to save vocabulary from this
	cv_transform = cv_fit.transform(text_list_df)

	#Creates output dataframe, and filters out all reviews that had an error with the star rating
	output_df = cv_transform.select(cv_transform.count_vec, cv_transform.star_rating_filtered)\
		.filter(cv_transform.star_rating_filtered != 2)

	#Saves the vocabulary and processed dataframe to S3 in JSON format
	vocab = spark.createDataFrame(cv_fit.vocabulary, schema=StringType())
	vocab.coalesce(1).write.mode("overwrite").json('s3://dsi-amazon-neural/complete_vocab_newest')

	output_df = output_df.repartition(1000)
	output_df.write.mode("overwrite").json('s3://dsi-amazon-neural/complete_data_newest')

	print('*'*50, '\n'*5, "positive reviews:", output_df.filter(output_df.star_rating_filtered == 1).count(), '*'*50) #138826230 positives recorded
	print('*'*50, '\n'*5, "negative reviews:", output_df.filter(output_df.star_rating_filtered == 0).count(), '*'*50)
Example #21
            'Error running command: %s. Return code: %d, Error: %s' % (
                ' '.join(args_list), proc.returncode, errors))
    return (output, errors)

'''
------------------------------------------------------
Initialize Spark Context, Google Translator and LOGGER
------------------------------------------------------
'''
translator = Translator()
conf = SparkConf().setAppName("spark-test").setMaster("local")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.config('spark.executor.memory', '1g').getOrCreate()
LOG4JLOGGER = spark.sparkContext._jvm.org.apache.log4j
LOGGER = LOG4JLOGGER.LogManager.getLogger(__name__)
LOGGER.info(sc.getConf().getAll())

'''
---------------------------------------
LOAD CONFIG FILE
---------------------------------------
'''
LOGGER.info("Following config passed to the Driver :")

#ANUVAAD_INPUT_CONFIG= yaml.safe_load(open('config.yml'))

'''
---------------------------------------
Get config values from yml file
---------------------------------------
'''
Example #22
#
# sc.binaryFiles()
# sc.binaryRecords()

# sc.cancelAllJobs()
# sc.cancelJobGroup(groupId)
# sc.setJobGroup(groupId,"")

# Dump profile information into the given directory path
# sc.dump_profiles(path)

rdd = sc.emptyRDD()
# Create an RDD that has no partitions or elements.

print(sc.getConf())  # returns the SparkConf object

# getLocalProperty(key)
# Get a local property set in this thread, or null if it is missing. See setLocalProperty().

# classmethod getOrCreate(conf=None)
"""
sc.hadoopFile()
sc.hadoopRDD()
sheet = sc.newAPIHadoopFile(
    '/user/me/sample.txt',
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text',
    conf={'textinputformat.record.delimiter': 'Time\tMHist'}
)
"""
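A small runnable sketch of the job-group calls mentioned in the comments above; the group id and description are arbitrary:

sc.setJobGroup("example-group", "jobs that can be cancelled together")
squares = sc.parallelize(range(10)).map(lambda x: x * x).collect()
# Any jobs still running under this group id could be cancelled with:
# sc.cancelJobGroup("example-group")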
Example #23
def main(sc: SparkContext,
         input_path: str,
         output_path: str,
         is_parquet: bool = False,
         from_local: bool = False,
         to_hdfs: bool = False,
         is_sequence: bool = False,
         is_gzip: bool = False) -> None:

    if not from_local:
        input_path = os.path.join(Config.get_hdfs_basepath(), input_path)

    log.info(f'Loading data from: {input_path}')

    # Stage 1: Read user dataset
    #   step 1: read files from hdfs
    user_path = os.path.join(input_path, 'users/*-r-00000')
    users = sc.wholeTextFiles(user_path) \
        .map(lambda x: (os.path.basename(x[0]), x[1])) \
        .filter(lambda x: x[0][0].isdigit())

    #   step 2: flatten data with user_id
    #       format ('{user_id}-r-0000', 'movie_id,rating,date\n...')
    #           -> (user_id, [movie_id, rating, date])
    users = flatten_user_data(users)
    # use for sparse vector
    max_user_id = users \
        .max(key=lambda x: x[0])[0]

    # Stage 2: Read movie data and flatten
    #movies = sc.textFile(os.path.join(input_path, 'movies/*'))
    #movies = flatten_movie_data(movies)

    # Stage 2: Build sparse vector of ratings
    movies = build_sparse_vector_with_ratings(users, max_user_id)

    # Stage 3: Build all combinations of movie pairs per user
    #   format: [(movie1, movie2),
    #            (SparseVector(movie1_rate), SparseVector(movie2_rate), num_rates1, num_rates2)]
    movie_pairs = build_item_pairs_by_key(movies)

    # Stage 4: Calculate correlations
    #   format: [movie1, movie2, cosine]
    # TODO: add Pearson
    output = calc_similarity(movie_pairs)

    # Stage 6: Output the scores to a file in HDFS/Parquet
    if to_hdfs:
        output_path = os.path.join(Config.get_hdfs_basepath(), output_path)

    if is_parquet:
        from pyspark.sql import SparkSession
        from pyspark.sql.types import StructType
        from pyspark.sql.types import StructField
        from pyspark.sql.types import LongType, FloatType

        # TODO: add Pearson and Jaccard
        schema = StructType([
            StructField('Movie1', LongType(), False),
            StructField('Movie2', LongType(), False),
            StructField('Cosine', FloatType(), True),
            #StructField('Pearson', FloatType(), True),
            StructField('Jaccard-bin', FloatType(), True),
            StructField('Jaccard', FloatType(), True)
        ])

        session = SparkSession.builder.config(conf=sc.getConf()) \
            .getOrCreate()

        output = output \
            .map(lambda x: [*x[0], *map(float, x[1])])

        output_path = os.path.join(output_path, 'result.parquet')
        session.createDataFrame(output, schema=schema) \
            .write \
            .mode('overwrite') \
            .parquet(output_path)

    else:
        # write as simple text with Writable from Hadoop API
        labels = ['Cosine', 'Jaccard-bin', 'Jaccard']

        if is_gzip:
            compClass = 'org.apache.hadoop.io.compress.GzipCodec'
        else:
            compClass = None

        output = output.map(
            lambda x: (convert_to_writable_format(x[0]),
                       convert_to_writable_format(x[1], labels)))
        if to_hdfs:
            if is_sequence:
                output.saveAsNewAPIHadoopFile(
                    output_path,
                    outputFormatClass=
                    'org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat',
                    keyClass='org.apache.hadoop.io.Text',
                    valueClass='org.apache.hadoop.io.Text')
            else:
                output.saveAsTextFile(output_path,
                                      compressionCodecClass=compClass)

        else:
            output.saveAsTextFile(output_path, compressionCodecClass=compClass)
    log.info(f'Saved at: {output_path}')
Example #24
class EpidataContext:
    """
    A connection to the epidata server, and all relevant context.
    """
    def __init__(self):

        spark_conf = SparkConf()
        self._sc = SparkContext(os.environ['SPARK_MASTER'],
                                'epidata',
                                conf=spark_conf)

        # get epidata spark conf
        conf = self._sc.getConf()

        cassandra_user = conf.get('spark.cassandra.auth.username', 'cassandra')
        cassandra_pass = conf.get('spark.cassandra.auth.password', 'epidata')
        cassandra_host = conf.get('spark.cassandra.connection.host',
                                  '127.0.0.1')
        cassandra_keyspace = conf.get('spark.epidata.cassandraKeyspaceName',
                                      'epidata_development')
        kafka_brokers = conf.get('spark.epidata.kafka.brokers',
                                 'localhost:9092')
        kafka_batch_duration = int(
            conf.get('spark.epidata.kafka.duration', '6'))
        self._measurement_class = conf.get('spark.epidata.measurementClass',
                                           'sensor_measurement')

        java_import(self._sc._jvm, "com.epidata.spark.EpidataContext")
        self._jec = self._sc._jvm.EpidataContext(self._sc._jsc)

        self._sql_ctx = SQLContext(self._sc, self._jec.getSQLContext())
        self._sql_ctx_pyspark = SQLContext(self._sc)
        self._cassandra_conf = {
            'keyspace': cassandra_keyspace,
            'user': cassandra_user,
            'password': cassandra_pass
        }
        self._has_checked_memory = False
        self._kafka_broker = os.environ.get('KAFKA_BROKER', kafka_brokers)
        self._batch_duration = kafka_batch_duration
        self._ssc = StreamingContext(self._sc, self._batch_duration)

    def query_measurements_original(self, field_query, begin_time, end_time):
        """
        Query for epidata measurements.

        Parameters
        ----------
        field_query : dictionary containing either strings or lists of strings
            A dictionary containing field names and the values those fields must
            contain in matching measurements. Some system configurations require
            that values of specific fields be specified. A string field value
            represents an equality match, while a list value represents set
            membership (all values within the set are matched).
        begin_time : datetime
            Beginning of the time interval to query, inclusive.
        end_time : datetime
            End of the time interval to query, exclusive.

        Returns
        -------
        result : epidata DataFrame
            A DataFrame containing measurements matching the query.
        """
        self._check_cluster_memory()

        java_field_query, java_begin_time, java_end_time = self._to_java_params(
            field_query, begin_time, end_time)

        java_data_frame = self._jec.query(java_field_query, java_begin_time,
                                          java_end_time)
        return DataFrame(jdf=java_data_frame, sql_ctx=self._sql_ctx)

    def query_measurements_cleansed(self, field_query, begin_time, end_time):

        self._check_cluster_memory()

        java_field_query, java_begin_time, java_end_time = self._to_java_params(
            field_query, begin_time, end_time)

        java_data_frame = self._jec.queryMeasurementCleansed(
            java_field_query, java_begin_time, java_end_time)
        return DataFrame(jdf=java_data_frame, sql_ctx=self._sql_ctx)

    def query_measurements_summary(self, field_query, begin_time, end_time):

        self._check_cluster_memory()

        java_field_query, java_begin_time, java_end_time = self._to_java_params(
            field_query, begin_time, end_time)

        java_data_frame = self._jec.queryMeasurementSummary(
            java_field_query, java_begin_time, java_end_time)
        return DataFrame(jdf=java_data_frame, sql_ctx=self._sql_ctx)

    def create_stream(self, ops, original="measurements", clean_up=True):
        esc = EpidataStreamingContext(self._sc, self._ssc,
                                      self._sql_ctx_pyspark, original,
                                      self._kafka_broker, self._cassandra_conf,
                                      self._measurement_class)
        esc.run_stream(ops, clean_up)

    def start_streaming(self):
        def _start():
            self._ssc.start()
            self._ssc.awaitTermination()

        thread = Thread(target=_start)
        thread.start()

    def stop_streaming(self):
        self._ssc.stop(False, True)
        self._ssc = StreamingContext(self._sc, self._batch_duration)

    def create_transformation(self,
                              func,
                              args=[],
                              destination="measurements_cleansed"):
        cassandra_tables = [
            'measurements', 'measurements_original', 'measurements_raw',
            'measurements_cleansed', 'measurements_processed',
            'measurements_summary', 'measurements_aggregates'
        ]
        datastore = "cassandra" if destination in cassandra_tables else "kafka"
        return Transformation(func, args, destination, datastore)

    def list_keys(self):
        """
        List the epidata measurement keys.

        Returns
        -------
        result : epidata DataFrame
            A DataFrame containing values of the principal fields used for
            classifying measurements.
        """
        self._check_cluster_memory()
        return DataFrame(jdf=self._jec.listKeys(), sql_ctx=self._sql_ctx)

    def _check_cluster_memory(self):
        if self._has_checked_memory:
            return
        try:
            spark_ip = re.match('spark://(.*):\d+',
                                os.environ['SPARK_MASTER']).group(1)
            clusterStatus = json.loads(
                urllib.urlopen('http://' + spark_ip + ':18080/json').read())
            if clusterStatus['memory'] - clusterStatus['memoryused'] < 3 * 512:
                raise MemoryError('All cluster memory resources are in use.')
        except MemoryError:
            raise
        except Exception as e:
            print e
            pass
        self._has_checked_memory = True

    def _to_java_params(self, field_query, begin_time, end_time):

        gc = self._sc._gateway._gateway_client

        def to_java_list(x):
            if isinstance(x, basestring):
                return ListConverter().convert([x], gc)
            return ListConverter().convert(x, gc)

        java_list_field_query = {
            k: to_java_list(v)
            for k, v in field_query.items()
        }
        java_field_query = MapConverter().convert(java_list_field_query, gc)
        java_begin_time = self._to_java_timestamp(begin_time)
        java_end_time = self._to_java_timestamp(end_time)

        return java_field_query, java_begin_time, java_end_time

    def _to_java_timestamp(self, dt):
        ts = long(time.mktime(dt.timetuple()) * 1e3 + dt.microsecond / 1e3)
        return self._sc._jvm.java.sql.Timestamp(ts)
Example #25
def main():
    sc = SparkContext('local[15]', 'haha')
    # sc._conf.set("spark.python.profile", "true")

    print(sc.getConf().getAll())

    d = load(sc)
    data_train_lp, data_dev_p, label_dev_gt, test_p = d['train_tfidf_lp'], d['dev_tfidf'], d['dev_gt'], d['test_tfidf']
    data_train_p, label_train_gt = d['train_tfidf'], d['train_gt']
    data_train, data_dev, data_test = d['train_raw'], d['dev_raw'], d['test_raw']

    data_train_lp = data_train_lp.sample(False, 0.01)
    
    # print(sum(data_train_lp.first()[0]))
    # print(data_train_lp.zipWithIndex().collect())
    print(data_train_lp.take(2))
    print("___________train_bayes_____________")
    sys.stdout.flush()
    nb = NaiveBayes.train(data_train_lp)
    print("___________trained_bayes___________")
    sys.stdout.flush()
    # nb.save(sc, 'bayes.model')
    bayes_result_dev = nb.predict(data_dev_p).map(int)
    bayes_result_dev.count()
    bayes_result_train = nb.predict(data_train_p).map(int)
    bayes_result_train.count()
    bayes_result_test = nb.predict(test_p).map(int)
    bayes_result_test.count()
    
    print("train info:")
    valid(bayes_result_train, label_train_gt)
    print("dev info:")
    valid(bayes_result_dev, label_dev_gt)

    print("___________train_logistic_____________")
    sys.stdout.flush()
    lg = LogisticRegressionWithSGD.train(data_train_lp, step=0.005)
    print("___________trained_logisitc___________")
    sys.stdout.flush()
    # lg.save(sc, 'logistic.model')
    logistic_result_dev = lg.predict(data_dev_p).map(int)
    logistic_result_train = lg.predict(data_train_p).map(int)
    logistic_result_test = lg.predict(test_p).map(int)

    print("train info:")
    valid(logistic_result_train, label_train_gt)
    print("dev info:")
    valid(logistic_result_dev, label_dev_gt)

    fused_train_p = stack_label([bayes_result_train, logistic_result_train])
    fused_dev_p = stack_label([bayes_result_dev, logistic_result_dev])
    fused_test_p = stack_label([bayes_result_test, logistic_result_test])

    fused_train_lp = label(data_train, fused_train_p)

    print("___________train_GBDT___________")
    sys.stdout.flush()
    gbdt = GradientBoostedTrees.trainClassifier(fused_train_lp, {})
    print('___________trained_GBDT_________')
    sys.stdout.flush()

    fused_result_train = gbdt.predict(fused_train_p)
    fused_result_dev = gbdt.predict(fused_dev_p)
    fused_result_test = gbdt.predict(fused_test_p)

    print("train info:")
    valid(fused_result_train, label_train_gt)
    print("dev info:")
    valid(fused_result_dev, label_dev_gt)

    dump(fused_result_test.map(int).collect())

    sc.show_profiles()
Example #26
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

conf = SparkConf().setAppName("HelloPython").setMaster("local")
sc = SparkContext(conf=conf)
print sc.getConf().getAll()
sc.stop()

spark = SparkSession.builder.appName("spark").getOrCreate()
print spark.sparkContext.getConf().getAll()
spark.sparkContext.stop()
Example #27
conf=(SparkConf()
    .setMaster("yarn")
    .setAppName("Stack Overflow Test")
    .set("spark.executor.memory","2g"))
sc=SparkContext(conf=conf)

from pyspark import SparkContext 
SparkContext.setSystemProperty('spark.executor.memory','2g')

from pyspark import SparkContext 
sc=SparkContext("yarn","StackOverflowTest", pyFiles=['sotest.py','lib.zip'])

# Run in terminal
# spark2-submit --executor-memory 4g stackoverflowtest.py

sc.getConf().getAll()
[(u'spark.driver.host', u'10.0.2.104'), (u'spark.eventLog.enabled', u'true'), (u'spark.ui.proxyBase', u'/proxy/application_1511794877761_0014'), (u'spark.driver.extraLibraryPath', u'/opt/cloudera/parcels/CDH-5.12.1-1.cdh5.12.1.p0.3/lib/hadoop/lib/native'), 

from pyspark.conf import SparkConf 
SparkSession.builder.config(conf=SparkConf())

SparkSession.builder.config('spark.eventLog.dir', '/stackexchange/logs')

spark=SparkSession.builder \
      .master('yarn') \
      .appName('StackOverflowTest') \
      .config('spark.executor.memory', '2g') \
      .getOrCreate()

spark=SparkSession.builder \
      .master('yarn') \
Example #28
        'C': C,
        'T': T,
        'distances': distances,
        'iterations': iterations
    }


def lowest_mean_distance_reduce(x, y):
    mean_distance_x = np.mean(x['distances'])
    mean_distance_y = np.mean(y['distances'])
    if mean_distance_x > mean_distance_y:
        return y
    else:
        return x


if __name__ == "__main__":
    sc = SparkContext(appName="Spark ICP room finder")
    spark = SparkSession.builder.appName("Spark ICP room finder").getOrCreate()
    num_of_executors = int(sc.getConf().get("spark.executor.instances"))
    int_rdd = sc.parallelize(range(num_of_executors))
    int_rdd.map(lambda x: import_packages(x)).collect()
    A_ply = sys.argv[1]
    room_paths = []
    for filename in os.listdir(sys.argv[2]):
        room_paths.append(filename)
    dist_room_paths = sc.parallelize(room_paths)
    most_probable_room = dist_room_paths.map(run_icp_map).reduce(
        lowest_mean_distance_reduce)
    print(most_probable_room)
Example #29
def _pr_spark_conf(sc: SparkContext):
    pr_red('Number of Works:     ' + str(sc.defaultParallelism))
    pr_red('Driver Memory:       ' + sc.getConf().get("spark.driver.memory"))
    pr_red('Maximum Result Size: ' + sc.getConf().get("spark.driver.maxResultSize"))
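A variant sketch of the helper above; it assumes the same pr_red printer and guards against keys that were never set explicitly, since SparkConf.get returns None for those and the string concatenation would then fail:

def _pr_spark_conf_safe(sc: SparkContext):
    conf = sc.getConf()
    pr_red('Number of Works:     ' + str(sc.defaultParallelism))
    pr_red('Driver Memory:       ' + conf.get("spark.driver.memory", "driver default"))
    pr_red('Maximum Result Size: ' + conf.get("spark.driver.maxResultSize", "driver default"))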
Example #30
import os
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StringType
from pyspark.sql.functions import trim, col, lit, regexp_replace, udf
from functools import reduce
from aut import *
import time
import argparse

conf = SparkConf()

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
WB_PATH = "/dfs/dataset/wb"
print(sc.getConf().getAll())


def timer(task, start, end):
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    time_text = "[{}]{:0>2}:{:0>2}:{:05.2f}\n".format(task, int(hours),
                                                      int(minutes), seconds)
    print_and_log(time_text)


def print_and_log(text):
    print(text)
    write_log(text)

from pyspark import SparkContext, SparkConf
import math
from pyspark.mllib.recommendation import ALS

conf = SparkConf().setAll([('spark.executor.memory', '20g'),
                           ('spark.executor.cores', '4'),
                           ('spark.cores.max', '24'),
                           ('spark.driver.memory', '30g'),
                           ('spark.driver.cores', '4'),
                           ('spark.executor.instances', '4')])

sc = SparkContext(conf=conf)
sc.getConf().getAll()
raw_data = sc.textFile("CHANGEME")
data = raw_data.map(lambda line: line.split(',')).map(
    lambda tokens: (tokens[0], tokens[1], tokens[2])).cache()

#split dataset
training_RDD, validation_RDD, test_RDD = data.randomSplit([6, 2, 2])
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

#training parameters
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.02

#training
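# The snippet is truncated at this point; a sketch of the usual training loop for
# this setup (an assumption, not the original code) would try each rank and keep
# the one with the lowest validation RMSE:
min_error = float('inf')
best_rank = -1
for rank in ranks:
    model = ALS.train(training_RDD, rank, iterations=iterations,
                      lambda_=regularization_parameter)
    predictions = model.predictAll(validation_for_predict_RDD).map(
        lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_RDD.map(
        lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(
        rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
    print('For rank {0} the RMSE is {1}'.format(rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank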
    def __init__(self, spark_context: SparkContext):
        conf: SparkConf = spark_context.getConf()
        log4j = spark_context._jvm.org.apache.log4j
        message_prefix = f"<{conf.get('spark.app.id')} {conf.get('spark.app.name')}>"

        self._log4j_logger = log4j.LogManager.getLogger(message_prefix)
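    # The snippet ends here; plausible companion methods (an assumption, not the
    # original code) would simply delegate to the Log4j logger captured above:
    def info(self, message: str) -> None:
        self._log4j_logger.info(message)

    def warn(self, message: str) -> None:
        self._log4j_logger.warn(message)

    def error(self, message: str) -> None:
        self._log4j_logger.error(message)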