def occCalc(self, channelID, testing=False):
    """Calculates occupancy for the user defined month"""
    if type(channelID) != list:
        raise TypeError('ChannelID is required to be a list')

    conf = SparkConf()\
        .setAppName("Occupancy Calc")\
        .set("spark.master", "local[*]")\
        .set("spark.driver.maxResultSize", "15G")
    sc = SparkContext(conf=conf)
    sql = SQLContext(sc)

    path = 'AZURE PATH' + self.month +\
        '/*/*/' + self.sensor + '*'
    data = sql.read.parquet(path)

    timeCount = data.select('scan_time').distinct().count()
    timeCount = sc.broadcast(timeCount)

    subData = data.select('scan_time', 'channel_id', 'power_dbm').filter(
        data.channel_id.isin(channelID))
    subData = subData.groupBy('channel_id').agg(
        (count(column('power_dbm')) / timeCount.value).alias('freq'),
        stddev(column('power_dbm')).alias('sd')).sort(
            asc('freq'), desc('sd'))

    if testing:
        subData.toPandas().to_csv('C:/path/freq.csv', sep='\t')
        sc.stop()
    else:
        # convert to pandas before stopping the SparkContext,
        # otherwise toPandas() would fail on a stopped context
        result = subData.toPandas()
        sc.stop()
        return result
def identify(log):
    """Append the next action and timespan to each record and remove timeouts.

    Arguments:
        log: Spark dataframe containing the original log

    Returns:
        Spark dataframe of the log with next action and timespan included
    """
    # window lag
    win = Window.partitionBy(LogFile.usrid).orderBy(
        column(LogFile.stamp).desc()
    )
    log = log.withColumn("next_", lag(LogFile.event).over(win))
    log = log.withColumn("next_stamp", lag(LogFile.stamp).over(win))
    log = log.na.drop()
    # timespan
    TIME_FMT = "yyyy-MM-dd HH:mm:ss"
    TIME_DIF = unix_timestamp(
        "next_stamp", format=TIME_FMT
    ) - unix_timestamp(LogFile.stamp, format=TIME_FMT)
    log = log.withColumn("restm", TIME_DIF)
    # time out
    log = log.filter(column("restm") <= 1800)
    # formatting
    return log.select(
        LogFile.stamp,
        LogFile.usrid,
        LogFile.event,
        "next_",
        "restm",
    )
def main(args):
    spark = sql.SparkSession.builder.appName('update-mutator').getOrCreate()

    msg_struct = types.StructType([
        types.StructField('text', types.StringType(), True),
        types.StructField('user_id', types.StringType(), True),
        types.StructField('update_id', types.StringType(), True)
    ])

    sentiments_struct = types.ArrayType(
        types.MapType(types.StringType(), types.FloatType(), False))

    analyzer = vader.SentimentIntensityAnalyzer()
    analyzer_bcast = spark.sparkContext.broadcast(analyzer)

    def sentiment_generator_impl(text):
        va = analyzer_bcast.value
        english = SpacyMagic.get('en_core_web_sm')
        result = english(text)
        sents = [str(sent) for sent in result.sents]
        sentiment = [va.polarity_scores(str(s)) for s in sents]
        return sentiment

    sentiment_generator = functions.udf(sentiment_generator_impl,
                                        sentiments_struct)

    def json_converter_impl(user_id, update_id, text, sentiments):
        obj = dict(user_id=user_id,
                   update_id=update_id,
                   text=text,
                   sentiments=sentiments)
        return json.dumps(obj)

    json_converter = functions.udf(json_converter_impl, types.StringType())

    records = (
        spark.readStream.format('kafka')
        .option('kafka.bootstrap.servers', args.brokers)
        .option('subscribe', args.intopic)
        .load()
        .select(functions.column('value').cast(types.StringType()).alias('value'))
        .select(functions.from_json(
            functions.column('value'), msg_struct).alias('json'))
        .select(functions.column('json.user_id'),
                functions.column('json.update_id'),
                functions.column('json.text'),
                sentiment_generator(
                    functions.column('json.text')).alias('sentiments'))
        .select(json_converter(functions.column('user_id'),
                               functions.column('update_id'),
                               functions.column('text'),
                               functions.column('sentiments')).alias('value'))
        .writeStream.format('kafka')
        .option('kafka.bootstrap.servers', args.brokers)
        .option('topic', args.outtopic)
        .option('checkpointLocation', '/tmp')
        .start())

    records.awaitTermination()
def main(args): """Configure and start the Kafka stream processor""" # acquire a SparkSession object spark = ( sql.SparkSession.builder.appName('kafka-spark-python').getOrCreate()) # if a user function is specified, download it and import it if args.userfunction is not None: try: logging.info('downloading user function') logging.info(args.userfunction) dl = urllib.urlretrieve(args.userfunction)[0] loader = importlib.SourceFileLoader('userfunction', dl) userfunction = pytypes.ModuleType(loader.name) loader.exec_module(userfunction) user_function = functions.udf(userfunction.user_defined_function, types.StringType()) logging.info('user function loaded') except Exception as e: logging.error('failed to import user function file') logging.error(e) user_function = None # configure the operations to read the input topic records = ( spark.readStream.format('kafka').option( 'kafka.bootstrap.servers', args.brokers).option('subscribe', args.intopic).load().select( functions.column('value').cast( types.StringType()).alias('value')) # add your data operations here, the raw message is passed along as # the alias `value`. # # for example, to process the message as json and create the # corresponding objects you could do the following: # # .select( # functions.from_json( # functions.column('value'), msg_struct).alias('json')) # # the following operations would then access the object and its # properties using the name `json`. ) # if it exists, add the user function to the stream pipeline if user_function is not None: records = (records.select( user_function(functions.column('value')).alias('value')).where( 'value is not null')) # configure the output stream writer = (records.writeStream.format('kafka').option( 'kafka.bootstrap.servers', args.brokers).option('topic', args.outtopic).option('checkpointLocation', '/tmp').start()) # begin processing the input and output topics writer.awaitTermination()
def _load_data(self):
    database = SparkDatabase()
    log = database.load("log")
    user = log.groupby("usrid").count().filter(column("count") < 100)
    log = log.join(user, "usrid", "leftanti")
    return log.filter(column("event").isin(self.events))
def main():
    df = spark.read.text('/var/tmp/coursera-data/final/en_US/en_US.news.txt')
    # df = spark.createDataFrame(range(int(1e6)), IntegerType())
    data_out_scala = (df.withColumn('sentences', do_split('value'))
                      .select(explode(column('sentences')).alias('sentence'))
                      .withColumn('tokens', do_tokenise('sentence'))
                      .select(explode(column('tokens')))
                      .groupBy('col')
                      .count()
                      .orderBy('count', ascending=False))
    data_out_scala.show()
def collect_data(self):
    """Collect user behavior dataframe"""
    log = self._load_data()
    agg = log.groupby("usrid", "event").agg(avg("restm").alias("avgrt"))
    data = []
    for i, e in enumerate(self.events):
        t = agg.filter(column("event") == e)
        t = t.withColumn("avgrt", round("avgrt", 2))
        data.append(
            t.select(column("usrid"), column("avgrt").alias(str(i))))
    return reduce(lambda x, y: x.join(y, ["usrid"], how="full"), data)
def load(self):
    csv = spark.read.csv(self.path, header=True)
    # merge date and time
    datetime = concat(column("date"), lit(" "), column("time"))
    tmp = csv.withColumn("datetime", datetime.cast(TimestampType()))
    # format column names
    log = tmp.select(
        column("datetime").alias(self.stamp),
        column("ip").alias(self.usrid),
        column("extention").alias(self.event),
    )
    return log
def test_using_select_expr(self):
    df = self.df.select(expr("DEST_COUNTRY_NAME"),
                        col("DEST_COUNTRY_NAME"),
                        column("DEST_COUNTRY_NAME"))
    self.assertListEqual(
        df.columns,
        ['DEST_COUNTRY_NAME', 'DEST_COUNTRY_NAME', 'DEST_COUNTRY_NAME'])
def convert_column(df, col_name, col_type_str):
    new_col = col_name + "_tmp"
    df = (df.withColumn(new_col, F.column(col_name).cast(col_type_str))
          .drop(col_name)
          .withColumnRenamed(new_col, col_name)
          .drop(new_col))
    return df
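# A minimal, hypothetical usage sketch for convert_column above. It assumes an
# active SparkSession named `spark` and `import pyspark.sql.functions as F`;
# the column names and values are illustrative only.
df = spark.createDataFrame([('1', 'a'), ('2', 'b')], ['amount', 'label'])
df = convert_column(df, 'amount', 'int')
df.printSchema()  # `amount` is now an integer column, other columns are untouched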
def main(args):
    spark = sql.SparkSession.builder.appName('update-analyzer').getOrCreate()

    msg_struct = types.StructType([
        types.StructField('text', types.StringType(), True),
        types.StructField('user_id', types.StringType(), True),
        types.StructField('update_id', types.StringType(), True)
    ])

    analyzer = vader.SentimentIntensityAnalyzer()
    analyzer_bcast = spark.sparkContext.broadcast(analyzer)
    vhost_bcast = args.vhost
    vport_bcast = args.vport

    def sentiment_generator_impl(text, user_id, update_id):
        va = analyzer_bcast.value
        english = SpacyMagic.get('en_core_web_sm')
        result = english(text)
        sents = [str(sent) for sent in result.sents]
        sentiments = [va.polarity_scores(str(s)) for s in sents]
        obj = dict(user_id=user_id,
                   update_id=update_id,
                   text=text,
                   sentiments=sentiments)
        try:
            con = httplib.HTTPConnection(host=vhost_bcast, port=vport_bcast)
            con.request('POST', '/', body=json.dumps(obj))
            con.close()
        except Exception as e:
            logging.warn('unable to POST to visualizer, error:')
            logging.warn(e.message)

    sentiment_generator = functions.udf(sentiment_generator_impl,
                                        types.NullType())

    records = (
        spark.readStream.format('kafka')
        .option('kafka.bootstrap.servers', args.brokers)
        .option('subscribe', args.topic)
        .load()
        .select(functions.column('value').cast(types.StringType()).alias('value'))
        .select(functions.from_json(
            functions.column('value'), msg_struct).alias('json'))
        .select(functions.column('json.user_id'),
                functions.column('json.update_id'),
                functions.column('json.text'),
                sentiment_generator(
                    functions.column('json.text'),
                    functions.column('json.user_id'),
                    functions.column('json.update_id')))
        .writeStream.format("console")
        .start())

    records.awaitTermination()
def getitem(self, file_path):
    # read from the json file, explode the nested hierarchy, and form a DataFrame
    js_task = ReadJsonFile(file_path)
    uiid = file_path.split('/')[1].split('.')[0]
    js_df = js_task.load()
    new_rdd = js_df.rdd.map(lambda row: row.asDict(True))
    d = new_rdd.collect()[0]
    elements = []
    self.iterdict(d, elements)
    # deal with empty layouts
    try:
        element_df = spark.createDataFrame(Row(**x) for x in elements)
        clean_df = element_df.select(
            column('bounds'),
            column('componentLabel').alias('componentlabel'))
        self.df = clean_df.withColumn('uiid', lit(uiid))
    except:
        self.df = None
    return self.df
def train_val_test_split(spark, dirname, random_seed):
    # Read in downsampled interactions
    interactions = spark.read.parquet(
        f'{dirname}/{dirname}_subsamples.parquet')
    interactions = interactions.filter(interactions.rating > 0).drop(
        'is_read', 'is_reviewed')

    # Find all unique user ids with more than 10 interactions
    userids = interactions \
        .groupby('user_id') \
        .count().alias('count') \
        .filter(column('count') > 10) \
        .select('user_id')

    # Sample train, val, test users
    train_user = userids.sample(False, TRAIN_FRAC, random_seed)
    remaining_user = userids.subtract(train_user)
    val_user = remaining_user.sample(False, VAL_FRAC / (1 - TRAIN_FRAC),
                                     random_seed)
    test_user = remaining_user.subtract(val_user)

    # Construct train, val, test interactions
    train_all = train_user.join(interactions, on='user_id', how='inner')
    val_all = val_user.join(interactions, on='user_id', how='inner')
    test_all = test_user.join(interactions, on='user_id', how='inner')

    # Split val and test in half
    window = Window.partitionBy('user_id').orderBy('book_id')
    val_interactions = (val_all.select(
        "user_id", "book_id", "rating",
        row_number().over(window).alias("row_number")))
    test_interactions = (test_all.select(
        "user_id", "book_id", "rating",
        row_number().over(window).alias("row_number")))

    val_add2train = val_interactions.filter(
        val_interactions.row_number % 2 == 0).drop('row_number')
    val = val_interactions.filter(
        val_interactions.row_number % 2 == 1).drop('row_number')
    test_add2train = test_interactions.filter(
        test_interactions.row_number % 2 == 1).drop('row_number')
    test = test_interactions.filter(
        test_interactions.row_number % 2 == 0).drop('row_number')

    # Add half of val and half of test back to train
    train = train_all.union(val_add2train).union(test_add2train)

    # Write train, val, test sets out
    train.write.mode('overwrite').parquet(f'{dirname}/train.parquet')
    val.write.mode('overwrite').parquet(f'{dirname}/val.parquet')
    test.write.mode('overwrite').parquet(f'{dirname}/test.parquet')
def basic_rec_val(spark, dirname, rank, regParam, k, random_seed):
    val_set = spark.read.parquet(f'{dirname}/val.parquet')

    print(
        f'Validating model with rank = {rank} and regParam = {regParam} '
        f'trained using {dirname} data ...')

    # load the corresponding trained model
    model = ALSModel.load(f'{dirname}/{rank}_{regParam}_model')

    # compute RMSE on the validation set
    predictions = model.transform(val_set)
    evaluator = RegressionEvaluator(metricName='rmse',
                                    labelCol='rating',
                                    predictionCol='prediction')
    rmse = evaluator.evaluate(predictions)
    print(f'rmse: {rmse}')

    print(f'Constructing top {k} books recommended per user ...')
    val_users = val_set.select('user_id').distinct()

    start_time = time.time()
    perUserPredictedTopKItemsDF = model.recommendForUserSubset(val_users, k)

    myudf = udf(extract_item, ArrayType(IntegerType()))
    perUserPredictedTopKItemsDF = perUserPredictedTopKItemsDF.withColumn(
        'predictions',
        myudf(perUserPredictedTopKItemsDF['recommendations'])).drop(
            'recommendations')

    print('Constructing actual books per user ...')
    perUserActualItemsDF = val_set.filter(
        column('rating') >= 3.0).groupBy('user_id').agg(
            expr('collect_list(book_id) as book_ids'))

    print('Constructing Ranking Metrics ...')
    perUserItemsRDD = perUserPredictedTopKItemsDF.join(
        perUserActualItemsDF, 'user_id').rdd.map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)

    precisionAtK = rankingMetrics.precisionAt(k)
    mAP = rankingMetrics.meanAveragePrecision
    end_time = time.time()
    time_delta = str(datetime.timedelta(seconds=end_time - start_time))

    print(f'p@{k}: {precisionAtK}')
    print(f'mAP: {mAP}')
    print(f'run time: {time_delta}')
def view(self, data, pred):
    """Use PCA to reduce dimension and visualize the data"""
    pca = PCA(k=3, inputCol="scaled", outputCol="pca-3")
    model = pca.fit(data)
    transformed = model.transform(data)
    view = (transformed.select("prediction", "pca-3")
            .withColumn("axis", self.to_array(column("pca-3")))
            .select(["prediction"] + [column("axis")[i] for i in range(3)]))
    dataframe = view.toPandas()
    fig = pyplot.figure(figsize=(20, 20))
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(
        dataframe.iloc[:, 1],
        dataframe.iloc[:, 2],
        dataframe.iloc[:, 3],
        c=dataframe.iloc[:, 0] + 2,
    )
    pyplot.show()
def main(Number, Query, File, write=False):
    print("\nSetting up the environment\n")
    conf = SparkConf().setAppName("Searcher")  # configure Spark
    sc = SparkContext(conf=conf)  # start Spark Context with the specific configuration
    sql = SQLContext(sc)  # start Spark SQL

    print("\nLoading the data\n")
    data = sql.read.load(File)

    print("\nQuerying the keywords in the database\n")
    totKeyword = len(Query)
    filtered = data.filter(
        column('word').isin([word.lower() for word in Query]))  # query the database based on the request
    sumed = filtered.groupby(filtered.location).agg(
        sum('tfIdf').alias("tot"))  # sum the TF-IDF scores
    counted = filtered.groupby(filtered.location).count()
    counted = counted.select(
        counted.location.alias("loc"),
        (column("count") / totKeyword).alias("freq"))  # determine the weight for each word
    result = sumed.join(counted, on=sumed.location == counted.loc,
                        how="inner")  # join the tables
    result = result.select(
        result.location,
        (column("tot") * column("freq")).alias("score")).orderBy(
            desc("score")).limit(Number)  # calculate the score and return the top N values

    if write:
        print("\nWriting the data\n")
        result.write.format('com.databricks.spark.csv').save(
            'query_' + ''.join(Query), header='true')
    else:
        result.show()
def main():
    conf = SparkConf().setAppName("Index Builder")  # configure Spark
    sc = SparkContext(conf=conf)  # start Spark Context with the specific configuration
    sql = SQLContext(sc)  # start Spark SQL

    text = sc.wholeTextFiles(
        "/user/root/bbcsport/*")  # fuzzy read: reads all files under bbcsport
    fileCount = text.count()

    # reformat data to make it cleaner and break text into words
    cleaned = text.map(lambda file: ("%s/%s" % (file[0].split("/")[len(file[0].split("/")) - 2],
                                                file[0].split("/")[len(file[0].split("/")) - 1]),
                                     file[1].lower().split()))\
        .map(lambda file: (file[0], [re.sub(r'[^\w\s]', '', word) for word in file[1]]))
    # regex cleaning from: https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python

    cleanedDF = cleaned.toDF(["location", "words"])  # create dataframe
    cleanedDF = cleanedDF.select(
        cleanedDF.location,
        explode(cleanedDF.words).alias("word"))  # flatten the list of words

    tfMap = cleanedDF.groupby(
        cleanedDF.location, cleanedDF.word).count()  # count occurrences of a word in a document
    tfReduce = tfMap.groupby(tfMap.location, tfMap.word).agg(
        sum("count").alias("tf"))  # calculate TF

    idfMap = cleanedDF.distinct().groupby(
        cleanedDF.word).count()  # count whether a word occurred in a document
    idfReduce = idfMap.select(
        idfMap.word,
        log(fileCount / (column("count"))).alias("idf"))  # calculate IDF

    joinTfIdf = tfReduce.join(idfReduce, tfReduce.word == idfReduce.word,
                              "inner")  # join the TF & IDF tables
    tfIdf = joinTfIdf.select(column("location"), tfReduce["word"],
                             (column("tf") * column("idf")).alias("tfIdf"))  # calculate TF-IDF

    tfIdf.write.parquet('bbc.parquet')  # write the result in an efficient file format
def down_sample(data, ratio_adjust=1.2):
    counts = (data.select('stars').groupBy('stars').count().orderBy(
        F.column("count"), ascending=False).collect())
    higher_bound = counts[0][1]
    lower_bound = counts[1][1]

    rand_gen = lambda x: randint(0, higher_bound) if x == "true" else -1
    udf_rand_gen = F.udf(rand_gen, IntegerType())

    threshold_to_filter = int(
        ratio_adjust * float(lower_bound) / higher_bound * higher_bound)
    data = data.withColumn("randIndex", udf_rand_gen("stars"))
    sampled_training_data = data.filter(
        data['randIndex'] < threshold_to_filter)
    return sampled_training_data.drop('randIndex')
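# A minimal, hypothetical usage sketch for down_sample above. It assumes an
# active SparkSession named `spark` plus the same imports the function relies on
# (pyspark.sql.functions as F, random.randint, IntegerType); the 'stars' column
# with majority value "true" is illustrative only.
reviews = spark.createDataFrame([("true",)] * 90 + [("false",)] * 10, ['stars'])
balanced = down_sample(reviews)
balanced.groupBy('stars').count().show()  # the "true" class is reduced to roughly 1.2x the minority count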
def preprocess(self):
    # read from csv file, load as DataFrame
    csv_task = ReadCSVFile(self.path)
    orig = csv_task.load()
    self.orig = orig
    # rename columns
    self.csv = orig.select(
        column('App Package Name').alias('appname'),
        column('Play Store Name').alias('name'),
        column('Category').alias('category'),
        column('Average Rating').alias('rating'),
        column('Number of Ratings').alias('ratenum'),
        column('Number of Downloads').alias('dlnum'),
        column('Date Updated').alias('update'),
        column('Icon URL').alias('url'))
def cluster(self, data):
    """Run k-means to cluster users into 2 groups"""
    for col in data.schema.names[2:]:
        data = data.withColumn(col, column("0") / column(col))
    data = data.dropna()
    vecAssembler = VectorAssembler(inputCols=data.schema.names[1:],
                                   outputCol="features")
    data = vecAssembler.transform(data)
    scaler = StandardScaler(
        inputCol="features",
        outputCol="scaled",
        withStd=True,
        withMean=True,
    )
    scalerModel = scaler.fit(data)
    data = scalerModel.transform(data)
    kmeans = KMeans().setK(2).setFeaturesCol("scaled").setSeed(666)
    model = kmeans.fit(data)
    predictions = model.transform(data)
    return predictions.select("usrid", "prediction")
def average_distance(co_ordinates_df):
    # sort on the provided time and capture the related lat/lng values per id
    window = Window.partitionBy(f.col('id')).orderBy(f.col('time'))
    sorted_df = co_ordinates_df.withColumn(
        'lat_lng_list', f.collect_list("coordinate").over(window))
    sorted_df = sorted_df.groupBy(f.col('id')) \
        .agg(f.max('lat_lng_list').alias('lat_lng_list'))

    from pyspark.sql.functions import udf

    # the UDF takes the list of lat/lng values and returns the average distance
    avg_udf = udf(calculate_average)
    result_df = sorted_df.withColumn('avg_dist',
                                     avg_udf(f.column('lat_lng_list')))
    result_df = result_df.select('id', 'avg_dist')
    return result_df
def shortest_path(v_from, v_to, df, max_path_length=10):
    """
        v_from - the source vertex
        v_to - the target vertex
        df - Spark DataFrame with the graph edges
        max_path_length - maximum path length

        Returns: pyspark.sql.DataFrame consisting of a single column with the paths found
    """
    temp_df = df.filter(df.follower_id == v_from)
    temp_df = temp_df.select(
        f.col('user_id').alias('last_neighbour'),
        f.col('follower_id').alias('path'))
    for i in range(max_path_length):
        if temp_df.filter(temp_df.last_neighbour.isin(v_to)).count() > 0:
            result_df = temp_df.filter(temp_df.last_neighbour.isin(v_to))\
                .select(f.concat('path', f.lit(','), 'last_neighbour').alias('path'))
            return result_df
        temp_df = temp_df.join(df, temp_df.last_neighbour == df.follower_id, how="inner")\
            .select(f.column('user_id').alias('last_neighbour'),
                    f.concat('path', f.lit(','), 'last_neighbour').alias('path'))
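# A minimal, hypothetical usage sketch for shortest_path above. It assumes an
# active SparkSession named `spark` and `from pyspark.sql import functions as f`,
# matching the function's imports; the edge data is illustrative only.
edges = spark.createDataFrame(
    [('2', '1'), ('3', '2'), ('4', '3')],  # each row: (user_id, follower_id), i.e. follower -> user
    ['user_id', 'follower_id'])
shortest_path('1', '3', edges).show()  # expected single path: '1,2,3'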
def _keys(r: Type[Relation],
          data: Union["pyspark.sql.DataFrame", "pandas.DataFrame"]):
    if is_pyspark_df(data, r):
        from pyspark.sql.functions import column, count

        duplicated_rows = (data.groupby(r.get_key_field_names()).agg(
            count("*").alias("count")).filter(column("count") > 1).count())
        if duplicated_rows > 0:
            raise ValueError(f"Key error for '{r.name()}': "
                             f"using keys '{r.get_key_field_names()}'"
                             f" there are {duplicated_rows} duplicates.")
    elif is_pandas_df(data, r):
        duplicated = data[r.get_key_field_names()].duplicated()
        duplicated_rows = len(data[duplicated])
        if duplicated_rows > 0:
            raise ValueError(f"Key error for '{r.name()}': "
                             f"using keys '{r.get_key_field_names()}'"
                             f" there are {duplicated_rows} duplicates.")
    logger.info(f"Relation {r.name()} has passed the key uniqueness check.")
def preprocess(self):
    # read from csv file, load as DataFrame
    csv_task = ReadCSVFile(self.path)
    imgfolder = csv_task.img_path()

    file = open('localdata/train_list', 'rb')
    train_list = pickle.load(file)
    train_id_list = [s.split('/')[1].split('.')[0] for s in train_list]
    file.close()

    orig = csv_task.load()
    self.orig = orig
    # rename columns
    csv_df = orig.select(
        column('UI Number').alias('uiid'),
        column('App Package Name').alias('appname'),
        column('Interaction Trace Number').alias('trace'),
        column('UI Number in Trace').alias('uicount')).where(
            column("uiid").isin(train_id_list))
    writepath_udf = udf(lambda uiid: imgfolder + uiid + '.png', StringType())
    self.csv = csv_df.withColumn("path", writepath_udf(column("uiid")))
    self.num = self.csv.count()
def tokenize(s):
    # this is just the treebank tokenizer
    return [word for word in t.tokenize(s) if word not in stopwords_set]

udf_tokenize = sql.udf(tokenize, types.ArrayType(types.StringType()))

(idb_df
 .select(sql.concat_ws(" ",
                       idb_df["data.dwc:occurrenceRemarks"],
                       idb_df["data.dwc:eventRemarks"],
                       idb_df["data.dwc:fieldNotes"]
                       )
         .alias("note"),
         idb_df["uuid"]
         )
 .where(sql.column("note") != "")
 .withColumn("tokens", udf_tokenize(sql.column("note")))
 .select(sql.column("uuid"),
         sql.explode(sql.column("tokens")).alias("token")
         )
 .groupBy(sql.column("uuid"), sql.column("token"))
 .count()
 .write
 .mode("overwrite")
 .parquet("/guoda/data/idigbio-{}-tf.parquet".format(idb_df_version))
 )

# 4:09 on mesos1 with 96 cores
#time HADOOP_USER_NAME=hdfs spark-submit --master mesos://mesos01.acis.ufl.edu:5050 --executor-memory 20G --driver-memory 10G --total-executor-cores 96 idb_tf_index.py
def tokenize(s):
    '''Tokenize a string with the nltk library'''
    # word_tokenize uses PunktSentenceTokenizer first, then
    # treebank_word_tokenizer on those so can get nested
    # lists.
    #return nltk.tokenize.word_tokenize(s)

    # this is just the treebank tokenizer
    return [word for word in t.tokenize(s) if word not in stopwords_set]

udf_tokenize = sql.udf(tokenize, types.ArrayType(types.StringType()))

(bhl_df
 .withColumn("tokens", udf_tokenize(sql.column("ocrtext")))
 .select(sql.column("itemid"),
         sql.explode(sql.column("tokens")).alias("token")
         )
 .groupBy(sql.column("itemid"), sql.column("token"))
 .count()
 .write
 .mode("overwrite")
 .parquet("/guoda/data/bhl-{}-tf.parquet".format(bhl_df_version))
 )

# 4:09 on mesos1 with 96 cores
#time HADOOP_USER_NAME=hdfs spark-submit --master mesos://mesos01.acis.ufl.edu:5050 --executor-memory 20G --driver-memory 10G --total-executor-cores 96 bhl_tf_index.py
# still hangs
df = spark.read.format("json").schema(myManualSchema)\
    .load("/data/flight-data/json/2015-summary.json")

# in Python
from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)

# in Python
df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").show(2)

# select columns in different ways
from pyspark.sql.functions import expr, col, column
df.select(
    expr("DEST_COUNTRY_NAME"),
    col("DEST_COUNTRY_NAME"),
    column("DEST_COUNTRY_NAME"))\
    .show(2)

df.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)
df.select(expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME"))

df.selectExpr(
    "*",  # all original columns
    "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry")\
    .show(2)

df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show(2)

# we need to pass explicit values into Spark that are just a value (rather than a new
# column).
from pyspark.sql.functions import lit
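# A short follow-on sketch showing the lit import above in use: lit wraps a plain
# Python value as a Column so it can be selected alongside existing columns
# (the alias "One" is illustrative).
df.select(expr("*"), lit(1).alias("One")).show(2)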
""" 基本PySpark語法 : 透過PySpark Shell""" from pyspark.sql.types improt * sc = SparkContext() # PySpark入口 sqlContext = SQLContext(sc) df = sqlContext.read.format("json").load("./spark-2.4.4-bin-hadoop2.7/examples/src/main/resources/employees.json") """查看Schema""" df.printSchema() """建構欄位""" from pyspark.sql.functions import col, column, expr col("col1") column("col2") """查看欄位""" df.columns """獲取第一條row""" df.first() """查看數據類型""" df.select("salary").dtypes """ 建立DataFrame 方法: createOrReplaceTempView 說明:存在就替換,不存在就創建 """ df.createOrReplaceTempView("dfTable")
from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", LongType(), False, metadata={"hello": "world"})
])
df = spark.read.format("json").schema(myManualSchema)\
    .load("/databricks-datasets/definitive-guide/data/flight-data/json/2015-summary.json")

# create columns
from pyspark.sql.functions import col, column
col("NewColumn")
column("NewColumn")
# not verified until the Catalog performs analysis

# example representing a DAG
from pyspark.sql.functions import expr
expr("(((someCol+5)*200)-6)<otherCol")

# access a DataFrame's columns
spark.read.format("json").load(
    "/databricks-datasets/definitive-guide/data/flight-data/json/2015-summary.json"
).columns

# inspect a Row object
df.first()

# create a Row object
spark = SparkSession.builder.master("local").appName("Testing") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

df_1 = create_frame(spark)
df_2 = df_1.withColumnRenamed("COL_1", "COL_2").withColumnRenamed("PKEY_1", "PKEY_2") \
    .withColumnRenamed("FKEY_1", "FKEY_2").withColumnRenamed("NAME_1", "NAME_2")
df_3 = df_1.withColumnRenamed("COL_1", "COL_3").withColumnRenamed("PKEY_1", "PKEY_3") \
    .withColumnRenamed("FKEY_1", "FKEY_3").withColumnRenamed("NAME_1", "NAME_3")
df_4 = df_1.withColumnRenamed("COL_1", "COL_4").withColumnRenamed("PKEY_1", "PKEY_4") \
    .withColumnRenamed("FKEY_1", "FKEY_4").withColumnRenamed("NAME_1", "NAME_4")
df_5 = df_1.withColumnRenamed("COL_1", "COL_5").withColumnRenamed("PKEY_1", "PKEY_5") \
    .withColumnRenamed("FKEY_1", "FKEY_5").withColumnRenamed("NAME_1", "NAME_5")
df_6 = df_1.withColumnRenamed("COL_1", "COL_6").withColumnRenamed("PKEY_1", "PKEY_6") \
    .withColumnRenamed("FKEY_1", "FKEY_6").withColumnRenamed("NAME_1", "NAME_6")
df_7 = df_1.withColumnRenamed("COL_1", "COL_7").withColumnRenamed("PKEY_1", "PKEY_7") \
    .withColumnRenamed("FKEY_1", "FKEY_7").withColumnRenamed("NAME_1", "NAME_7")

join_1 = (F.column("FKEY_2") == F.column("PKEY_1"))
join_2 = (F.column("FKEY_3") == F.column("PKEY_2"))
join_3 = (F.column("FKEY_4") == F.column("PKEY_3"))
join_4 = (F.column("FKEY_5") == F.column("PKEY_4"))
join_5 = (F.column("FKEY_6") == F.column("PKEY_5"))

result = df_1.filter("FKEY_1 is null").join(df_2, join_1, "left")
result = result.join(df_3, join_2, "left")
result = result.join(df_4, join_3, "left")
result = result.join(df_5, join_4, "left")
result.show()
from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", LongType(), False, metadata={"hello": "world"})
])
df = spark.read.format("json").schema(myManualSchema)\
    .load("/data/flight-data/json/2015-summary.json")


# COMMAND ----------

from pyspark.sql.functions import col, column
col("someColumnName")
column("someColumnName")


# COMMAND ----------

from pyspark.sql.functions import expr
expr("(((someCol + 5) * 200) - 6) < otherCol")


# COMMAND ----------

from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)


# COMMAND ----------
from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", LongType(), False, metadata={"hello": "world"})
])
df = spark.read.format("json").schema(myManualSchema)\
    .load("/databricks-datasets/definitive-guide/data/flight-data/json/2015-summary.json")


# COMMAND ----------

from pyspark.sql.functions import col, column
col("someColumnName")
column("someColumnName")


# COMMAND ----------

from pyspark.sql.functions import expr
expr("(((someCol + 5) * 200) - 6) < otherCol")


# COMMAND ----------

from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)


# COMMAND ----------

myRow[0]
myRow[2]
        app_df, ui_df = prefilter(db, test_input)
    else:
        app_df = db.load(table='app')
        ui_df = db.load(table='ui')
    return app_df, ui_df


if __name__ == '__main__':
    db = Database()

    # load in a test sample
    file = open('localdata/test_ui', 'rb')
    test_ui = pickle.load(file)
    file.close()

    # prefiltering by category
    time_start = time.time()
    app_df, ui_df = load_from_db(db, test_ui, is_prefilter=True)
    ui_targets = ui_df.select(column('uiid'), column('appname')).collect()
    time_end = time.time()
    print('Read from sql time cost', time_end - time_start, 's')

    # use the ML/DL models
    time_start = time.time()
    model_demo = Model(test_ui, ui_targets)
    sim_df_pd = model_demo.cos_similarity()
    sim_app_df = ui_df.select(column('uiid'), column('appname')).where(
        ui_df.uiid.isin(sim_df_pd.uiid.iloc[0:6].tolist()))
    sim_app = sim_app_df.collect()
    result_df = app_df.where(
        app_df.appname.isin([row.appname for row in sim_app]))
    outputs = result_df.join(sim_app_df, "appname", "left").collect()
    time_end = time.time()
    print('Model time cost', time_end - time_start, 's')

    file = open('localdata/result_demo', 'wb')