Example #1
File: test_udf.py Project: drewrobb/spark
    def test_udf_in_generate(self):
        from pyspark.sql.functions import udf, explode
        df = self.spark.range(5)
        f = udf(lambda x: list(range(x)), ArrayType(LongType()))
        row = df.select(explode(f(*df))).groupBy().sum().first()
        self.assertEqual(row[0], 10)

        df = self.spark.range(3)
        res = df.select("id", explode(f(df.id))).collect()
        self.assertEqual(res[0][0], 1)
        self.assertEqual(res[0][1], 0)
        self.assertEqual(res[1][0], 2)
        self.assertEqual(res[1][1], 0)
        self.assertEqual(res[2][0], 2)
        self.assertEqual(res[2][1], 1)

        range_udf = udf(lambda value: list(range(value - 1, value + 1)), ArrayType(IntegerType()))
        res = df.select("id", explode(range_udf(df.id))).collect()
        self.assertEqual(res[0][0], 0)
        self.assertEqual(res[0][1], -1)
        self.assertEqual(res[1][0], 0)
        self.assertEqual(res[1][1], 0)
        self.assertEqual(res[2][0], 1)
        self.assertEqual(res[2][1], 0)
        self.assertEqual(res[3][0], 1)
        self.assertEqual(res[3][1], 1)
Example #2
def process(rdd):
  print(">>>> BEGIN CASS")
  wonbids = getSqlContextInstance(rdd.context).createDataFrame(rdd) 
  wonbids.registerTempTable("wonbids")
  wonbids.write.format("org.apache.spark.sql.cassandra").\
           options(keyspace="text_bids", table="bidswon").\
           save(mode="append")
  #sqlContext.cacheTable('wonbids')
  # wonbids.show()
 
  symptoms = wonbids.select(wonbids.id,wonbids.created_utc,explode(wonbids.symptomtags).alias('symptom'))
  symptoms.registerTempTable("symptoms")
  symptoms.write.format("org.apache.spark.sql.cassandra").\
         options(keyspace="text_bids", table="symptoms").\
         save(mode="append")
  # symptoms.show()

  conditions = wonbids.select(wonbids.id,wonbids.created_utc,explode(wonbids.conditiontags).alias('condition'))
  conditions.registerTempTable("conditions")
  conditions.write.format("org.apache.spark.sql.cassandra").\
         options(keyspace="text_bids", table="conditions").\
         save(mode="append")
  # conditions.show()
  
  # send back to master to process
  for w in wonbids.collect():
    event.Event('toES', {'id':w.id,'pharmatag':w.pharmatag,'price':w.price,'created_utc':w.created_utc,'symptomtags':w.symptomtags,'conditiontags':w.conditiontags})
  print(">>>> END CASS")
Example #3
    def test_explode(self):
        from pyspark.sql.functions import explode, explode_outer, posexplode_outer
        d = [
            Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"}),
            Row(a=1, intlist=[], mapfield={}),
            Row(a=1, intlist=None, mapfield=None),
        ]
        rdd = self.sc.parallelize(d)
        data = self.spark.createDataFrame(rdd)

        result = data.select(explode(data.intlist).alias("a")).select("a").collect()
        self.assertEqual(result[0][0], 1)
        self.assertEqual(result[1][0], 2)
        self.assertEqual(result[2][0], 3)

        result = data.select(explode(data.mapfield).alias("a", "b")).select("a", "b").collect()
        self.assertEqual(result[0][0], "a")
        self.assertEqual(result[0][1], "b")

        result = [tuple(x) for x in data.select(posexplode_outer("intlist")).collect()]
        self.assertEqual(result, [(0, 1), (1, 2), (2, 3), (None, None), (None, None)])

        result = [tuple(x) for x in data.select(posexplode_outer("mapfield")).collect()]
        self.assertEqual(result, [(0, 'a', 'b'), (None, None, None), (None, None, None)])

        result = [x[0] for x in data.select(explode_outer("intlist")).collect()]
        self.assertEqual(result, [1, 2, 3, None, None])

        result = [tuple(x) for x in data.select(explode_outer("mapfield")).collect()]
        self.assertEqual(result, [('a', 'b'), (None, None), (None, None)])
Example #4
    def test_explode(self):
        from pyspark.sql.functions import explode
        d = [Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})]
        rdd = self.sc.parallelize(d)
        data = self.sqlCtx.createDataFrame(rdd)

        result = data.select(explode(data.intlist).alias("a")).select("a").collect()
        self.assertEqual(result[0][0], 1)
        self.assertEqual(result[1][0], 2)
        self.assertEqual(result[2][0], 3)

        result = data.select(explode(data.mapfield).alias("a", "b")).select("a", "b").collect()
        self.assertEqual(result[0][0], "a")
        self.assertEqual(result[0][1], "b")
Example #5
 def data(self):
     from pyspark.sql.functions import array, explode, col, lit
     return self.spark.range(10).toDF('id') \
         .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \
         .withColumn("v", explode(col('vs'))) \
         .drop('vs') \
         .withColumn('w', lit(1.0))
Example #6
	def create_one_hot_dict(self,input_df):
		"""Creates a one-hot-encoder dictionary based on the input data.

		Args:
			input_df (DataFrame with 'features' column): A DataFrame where each row contains a list of
				(featureID, value) tuples.

		Returns:
			dict: A dictionary where the keys are (featureID, value) tuples and map to values that are
				unique integers.
		"""
		distinct_feats = input_df.select(explode(input_df.features)).distinct()
		#print distinct_feats.take(1)[0]
		return distinct_feats.rdd.map(lambda x: tuple(x[0])).zipWithIndex().collectAsMap()
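For context, here is a minimal usage sketch of the same logic written standalone (hypothetical data; assumes an active SparkSession and the same `explode` import):

from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import explode

spark = SparkSession.builder.getOrCreate()

# Hypothetical input: each row holds a list of (featureID, value) pairs.
sample_df = spark.createDataFrame([
    Row(features=[(0, 'mouse'), (1, 'black')]),
    Row(features=[(0, 'cat'), (1, 'tabby')]),
])

# Same two steps as create_one_hot_dict above.
distinct_feats = sample_df.select(explode(sample_df.features)).distinct()
ohe_dict = distinct_feats.rdd.map(lambda x: tuple(x[0])).zipWithIndex().collectAsMap()
# e.g. {(0, 'mouse'): 0, (1, 'black'): 1, (0, 'cat'): 2, (1, 'tabby'): 3}
# (the exact integers depend on partition ordering)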
Example #7
def get_average_expected_change(timeframe, partner, purchase):
    """
    returns the average expected change of day 1 to day 7
    input: a period (string), a partner (string), a spark dataframe
    output: a dictionary with key=day and value=average_expected_change
    """
    keys = [1, 2, 3, 4, 5, 6, 7]
    result = {key: 0 for key in keys}
    timeframe_is = get_date(timeframe)
    unlisted = purchase.select(explode(purchase.prediction.days).alias("test")).collect()
    # print 'length of unlisted:', len(unlisted)
    total = len(unlisted) / len(keys)
    # print total
    for i in range(len(unlisted)):
        result[i%7+1] += unlisted[i][0].expected_change / total
    return result
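The core of this function is exploding the nested prediction.days array; a minimal sketch of just that step on hypothetical data (the nested struct layout is assumed from the select above):

from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import explode

spark = SparkSession.builder.getOrCreate()

# Hypothetical schema mirroring the select above: prediction.days is an
# array of structs, each carrying an expected_change field.
purchase = spark.createDataFrame([
    Row(prediction=Row(days=[Row(expected_change=0.10),
                             Row(expected_change=-0.05)]))
])

# One output row per day; row[0].expected_change is what the loop above reads.
unlisted = purchase.select(explode(purchase.prediction.days).alias("test")).collect()
print([r[0].expected_change for r in unlisted])  # [0.1, -0.05]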
Example #8
#linesOut.saveAsTextFile("hdfs:///tmp/fact_icd9_encounter_08242016_supP2_rdd_.csv")

# linesOut.saveAsTextFile('hdfs:///tmp/fact_icd9_encounter_08242016_supP2_rdd.csv')

linesOut.saveAsTextFile("/Users/jayurbain/Dropbox/MCW/fact_icd9_encounter_08242016_supP2_rdd.txt")

linesOut.reduce( lambda k,v: (k))

################

from pyspark.sql import Row
from pyspark.sql.functions import explode

df = sqlContext.createDataFrame([Row(a=1, b=[1,2,3],c=[7,8,9]), Row(a=2, b=[4,5,6],c=[10,11,12])])
df1 = df.select(df.a,explode(df.b).alias("b"),df.c)
df2 = df1.select(df1.a,df1.b,explode(df1.c).alias("c"))



###################################

# fpgrowth example
itemsets = parts.map(lambda p: ( p[3].strip().split(',') ) )
itemsets.getNumPartitions()
model_fp = FPGrowth.train(itemsets, minSupport=0.005, numPartitions=10)
result = model_fp.freqItemsets().collect()
for i in sorted(result, key=operator.itemgetter(1), reverse=True):
    print '(', ', '.join(i.items), ')', 'freq=', str(i.freq)

Example #9
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc, explode
from pyspark.sql.types import *
from storage import Sqlite

PARTITIONS = 500
THRESHOLD = 50

if __name__ == "__main__":
    conf = SparkConf().setAppName("reddit")
    conf.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    conf.set('spark.local.dir', '/mnt/work')
    conf.set('spark.driver.maxResultSize', '12g')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    fields = [StructField("subreddit", StringType(), True),
          StructField("body", StringType(), True)]
    rawDF = sqlContext.read.json("file:///mnt/s3/2015/*", StructType(fields))
    # split comments into words
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(rawDF)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    filteredDataFrame = remover.transform(wordsDataFrame)
    # explode terms into individual rows
    termDataFrame = filteredDataFrame.select(['subreddit', explode(filteredDataFrame.filtered).alias("term")])
    # group by subreddit and term, then count occurrences of each term per subreddit
    countsDataFrame = termDataFrame.groupBy(['subreddit', 'term']).count()

    db =  Sqlite()
    countsDataFrame.select(['subreddit', 'term', 'count']).filter('count > {}'.format(THRESHOLD)).foreachPartition(db.saveSubredditWords)
Example #10
from pyspark.sql import SparkSession  
from pyspark.sql.functions import split, explode, col  
  
spark = SparkSession.builder.appName("wordcount").getOrCreate()  
lines = spark.read.text("README.md")  
  
words = lines.select(explode(split(lines.value, ",")).alias("words"))  
  
words.withColumn('word', explode(split(col('words'), ' ')))\
    .groupBy('word')\
    .count()\
    .sort('count', ascending=False)\
    .show()
Example #11
evaluator = MulticlassClassificationEvaluator()
acc = evaluator.evaluate(pred)


#####################################################################
# https://www.datacamp.com/courses/recommendation-engines-in-pyspark
############# Collaborative Filtering
mo.printSchema()
mo = mo.select(mo.UserId.cast('integer'), mo.MovieId.cast('integer'), mo.rating.cast('double'))
mo.show()

# converting data into a row-based (long) dataframe
from pyspark.sql.functions import array, col, explode, lit, struct
def to_long(df, by=["userId"]):  # "by" lists the id columns to keep; every other column is melted into (movieId, rating) rows
    cols = [c for c in df.columns if c not in by]
    kvs = explode(array([struct(lit(c).alias("movieId"), col(c).alias("rating")) for c in cols])).alias("kvs")
    long_df = df.select(by + [kvs]).select(by + ["kvs.movieId", "kvs.rating"]).filter("rating IS NOT NULL")
    # Excluding null ratings values since ALS in Pyspark doesn't want blank/null values
    return long_df

mo = to_long(mo)
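To see what to_long produces, here is a small sketch on a hypothetical wide ratings table (assumes an active SparkSession named spark):

# Hypothetical wide table: one row per user, one column per movie id.
wide = spark.createDataFrame(
    [(1, 5.0, None), (2, None, 3.0)],
    ["userId", "10", "20"])

to_long(wide, by=["userId"]).show()
# Illustrative output (row order not guaranteed):
# +------+-------+------+
# |userId|movieId|rating|
# +------+-------+------+
# |     1|     10|   5.0|
# |     2|     20|   3.0|
# +------+-------+------+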

# sparsity
n_rating = mo.select('rating').count()
n_users = mo.select('UserId').distinct().count()
n_movies = mo.select('MovieId').distinct().count()
sparsity = (1 - (n_rating * 1.0 / (n_users * n_movies)))*100
print("The dataframe is ", "%.2f" %sparsity + " % empty")

# preprocessing
from pyspark.sql.functions import monotonically_increasing_id
Example #12
    host = sys.argv[1]
    port = int(sys.argv[2])

    spark = SparkSession\
        .builder\
        .appName("StructuredNetworkWordCount")\
        .getOrCreate()

    # Create DataFrame representing the stream of input lines from connection to host:port
    lines = spark\
        .readStream\
        .format('socket')\
        .option('host', host)\
        .option('port', port)\
        .load()

    # Split the lines into words
    words = lines.select(explode(split(lines.value, ' ')).alias('word'))

    # Generate running word count
    wordCounts = words.groupBy('word').count()

    # Start running the query that prints the running counts to the console
    query = wordCounts\
        .writeStream\
        .outputMode('complete')\
        .format('console')\
        .start()

    query.awaitTermination()
Example #13
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC   + The first issue is that we need to split each line by its spaces.
# MAGIC   + The second issue is that we need to filter out empty lines or words.
# MAGIC
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row.  To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (shakespeareDF
                .select(explode(split(shakespeareDF.value , ' ')).alias('word'))
                .filter("word<>''"))
shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (4e) Count the words **
Example #14
        casSession = cascluster.connect('test2')
        for aggItem in agg:
            if aggItem[0] != "":
                    casSession.execute('INSERT INTO author_pub_simu (author, pub) VALUES (%s, %s)', (str(aggItem[0]), str(aggItem[1])))
        casSession.shutdown()
        cascluster.shutdown()

sc = SparkContext("spark://ip-172-31-2-40:7077", "2016_test")
sqlContext = SQLContext(sc)

# read in data from HDFS and select columns
df1 = sqlContext.read.json("hdfs://ec2-52-34-128-244.us-west-2.compute.amazonaws.com:9000//simulated/fake_data_p1*.json").dropna()
df_sel = df1.select('recid', 'authors','co-authors','references', 'creation_date').withColumnRenamed('co-authors', 'co_authors').persist(StorageLevel.MEMORY_AND_DISK)

# explode references list and group by citation id to calculate the number of times each publication has been cited
df_references = df_sel.select('recid', explode('references')).withColumnRenamed('_c0','cited_id').groupBy('cited_id').count().withColumnRenamed('count','num_cited')

# combine author and co-author list to generate a total list of authors and convert rdd into dataframe
rdd_authors = df_sel.rdd.map(lambda x:{'recid':x.recid, 'authors': append_author(x.authors, x.co_authors), 'creation_date': fetch_year(x.creation_date)})
df_authors = sqlContext.createDataFrame(rdd_authors)

# join citation and author dataframes
df_join = df_references.join(df_authors, df_references.cited_id == df_authors.recid, 'inner').drop(df_authors.recid)

# explode author and save to Cassandra database
df_explode_author = df_join.select('cited_id', 'num_cited', explode('authors'), 'creation_date').withColumnRenamed('_c0', 'author')
df_explode_author.persist(StorageLevel.MEMORY_AND_DISK)
df_sel.unpersist()
df_explode_author.rdd.foreachPartition(aggToCassandra2)

# combine each author publication list,  group by author and calculate H-index for each author
Example #15
    .builder \
    .appName("StructuredNetworkWordCount") \
    .getOrCreate()

# create dataframe representing the stream of input lines from connection to localhost:9999
lines = spark \
    .readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9999) \
    .load()

# split the lines into words:

words = lines.select(explode(split(
    lines.value,
    " ")).alias("word"))  # word is the new column name created with alias

# generate running word count:

wordCounts = words.groupBy('word').count()

# we have now set up the query on the streaming data. All that is left is to actually start receiving data and computing counts
# To do this, we set it up to print the complete set of counts (specified by outputMode('complete')) to the console every time they are updated
# and then start the streaming computation using start().

# the lines dataframe is the input table, and wordCounts dataframe is the result table

# Start running the query that prints the running counts to the console

# open a terminal and type nc -lk 9999
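The snippet stops just before the query itself; below is a sketch of the step the comments describe, mirroring the equivalent code in Example #12 above:

# Start the streaming query: print the complete set of counts to the
# console every time they are updated.
query = wordCounts \
    .writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

query.awaitTermination()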
Example #16
    def main(self, sc, *args):
        spark = SparkSession(sc)
        observations_parquet_path = args[0]
        gene_parquet_path = args[1]
        stats_parquet_path = args[2]
        ontology_metadata_parquet_path = args[3]
        orthologe_parquet_path = args[4]
        output_path = args[5]

        stats_df = spark.read.parquet(stats_parquet_path)
        observations_df = spark.read.parquet(observations_parquet_path)
        gene_df = spark.read.parquet(gene_parquet_path)
        ontology_df = spark.read.parquet(ontology_metadata_parquet_path)
        orthologe_df = spark.read.parquet(orthologe_parquet_path)

        group_by_cols = [
            "gene_symbol",
            "gene_accession_id",
            "allele_symbol",
            "allele_accession_id",
            "life_stage_name",
            "zygosity",
            "strain_name",
            "strain_accession_id",
        ]

        grouped_stats_cols = [
            "mp_term_id",
            "top_level_mp_term_id",
        ]

        stats_df = stats_df.withColumnRenamed("marker_symbol", "gene_symbol")
        stats_df = stats_df.withColumnRenamed(
            "marker_accession_id", "gene_accession_id"
        )
        stats_df = stats_df.withColumn("life_stage_name", explode("life_stage_name"))

        batch_query_df = (
            stats_df.where(col("significant"))
            .groupBy(*group_by_cols)
            .agg(
                *[
                    collect_set(col_name).alias(col_name)
                    for col_name in grouped_stats_cols
                ]
            )
        )

        grouped_obs_cols = [
            "procedure_stable_id",
            "procedure_name",
            "parameter_stable_id",
            "parameter_name",
        ]

        experiment_data = observations_df.groupBy(*group_by_cols).agg(
            *[collect_set(col_name).alias(col_name) for col_name in grouped_obs_cols]
        )

        batch_query_df = batch_query_df.join(
            experiment_data, group_by_cols, "left_outer"
        )

        gene_df = gene_df.select(
            col("mgi_accession_id").alias("gene_accession_id"),
            "ensembl_gene_id",
            "assignment_status",
            "conditional_allele_production_status",
            "es_cell_production_status",
            "mouse_production_status",
            "phenotype_status",
        )

        batch_query_df = batch_query_df.join(gene_df, "gene_accession_id", "left_outer")

        grouped_orth_cols = ["hg_hgnc_acc_id", "hg_symbol"]

        orthologe_df = orthologe_df.withColumnRenamed(
            "mg_mgi_gene_acc_id", "gene_accession_id"
        )
        orthologe_df = orthologe_df.where(
            (col("o_is_max_human_to_mouse") == "max")
            & (col("o_is_max_mouse_to_human") == "max")
            & (col("mmf_category_for_threshold") == "one-to-one")
            & (col("hmf_category_for_threshold") == "one-to-one")
            & (col("o_support_count") >= 5)
        ).select("gene_accession_id", "hg_hgnc_acc_id", "hg_symbol")
        # Remove the ones that have more than one orthologue mmf_category_for_threshold=one-to-one hmf_category_for_threshold=one-to-one
        orthologe_df = orthologe_df.groupBy("gene_accession_id").agg(
            *[collect_set(col_name).alias(col_name) for col_name in grouped_orth_cols]
        )

        batch_query_df = batch_query_df.join(
            orthologe_df, "gene_accession_id", "left_outer"
        )

        batch_query_df.write.parquet(output_path)
Example #17
sc = SparkContext()
spark = SparkSession \
	.builder \
	.appName("Python Spark SQL basic example") \
	.config("spark.some.config.option", "some-value") \
	.getOrCreate()

path = sys.argv[1]
df = spark.read.option("multiline", "true").json(path)

name = df.select("meta.view.name").collect()
table_id = df.select("meta.view.id").collect()
descr = df.select("meta.view.description").collect()
category = df.select("meta.view.category").collect()

data = df.select("data").collect()
table_size = len(data)

t = pd.DataFrame({'ID':table_id,'Name': name,'Description':descr,'Category':category,'Size':table_size})
schema_list = df.select(explode("meta.view.columns").alias("col")).select("col.name").collect()

schema_l = list()
for i in schema_list:
	schema_l.append(sc.parallelize(i).take(1)[0])

fields = [StructField(field_name, StringType(), True) for field_name in schema_l]
schema = StructType(fields)
raw_data = spark.createDataFrame(data[0][0],schema)
#print(len(data[0][0][0]))
raw_data.show()
Example #18
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pprint import pprint

if __name__ == "__main__":

    session = SparkSession.builder.appName("Payload").getOrCreate()

    dataFrameReader = session.read

    responses = dataFrameReader \
        .option("header", "true") \
        .option("inferSchema", value = True) \
        .json("payload/payload500.json")

    print("=== Print out schema ===")
    responses.printSchema()
    pprint(responses.columns)
    df = responses.select(explode('events'))
    df.show()
    #pprint(df.collect())
    #responses.show()

    session.stop()
Example #19
	#Convert text_entry to lower case
	columnName = "text_entry"
	df2 = df2.withColumn(columnName, lower(col(columnName)))

	#Remove punctuation from text_entry
	df2 = df2.withColumn(columnName, regexp_replace(col(columnName), '[^\sa-zA-Z0-9]', ''))

	#Drop extra columns from df2
	df2 = df2.drop('line_id', 'line_number', 'play_name', 'speaker', 'speech_number', 'type')

	#Split text_entry column into words by using the split function
	df2 = df2.withColumn("text_entry", split("text_entry", " "))

	#Explode eachtext_entry value into multiple rows to get _id with each word of text_entry
	df2 = df2.withColumn("token", explode(col("text_entry")))

	#Calculating Term Frequency by grouping based on ‘_id’ and ‘token’ and counting how many times each token occurs in each document
	df_tf = df2.groupby("_id", "token").agg(F.count("text_entry").alias("tf"))

	#Calculating Document Frequency by grouping on each token and counting the number of documents it occurs in
	df_idf = df2.groupby("token").agg(F.countDistinct("_id").alias("df"))

	#Converting ‘df’ column to Double Type in order for easy calculation later on
	df_idf = df_idf.withColumn("df", df_idf["df"].cast(DoubleType()))

	#Calculating IDF values
	df_idf = df_idf.withColumn("idf", F.log10(N/df_idf["df"]))

	#Joining df_tf and df_idf based on token columns
	tokensWithTfIdf = df_tf.join(df_idf, df_tf["token"] == df_idf["token"], how='left').drop(df_idf["token"])
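The comments above compute TF and IDF separately; the usual final step (not shown in the excerpt) is to multiply them per (_id, token). A one-line sketch, assuming the same F alias for pyspark.sql.functions used above:

# Hypothetical final step: tf_idf score per (_id, token).
# e.g. a token appearing 3 times in a document and in 10 of N=100 documents
# gets tf_idf = 3 * log10(100/10) = 3.0
tokensWithTfIdf = tokensWithTfIdf.withColumn("tf_idf", F.col("tf") * F.col("idf"))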
Example #20
# MAGIC %md
# MAGIC ### Comments

# COMMAND ----------

# MAGIC %md
# MAGIC count:long
# MAGIC items:array
# MAGIC total_count:long

# COMMAND ----------

from pyspark.sql.functions import explode, when
df_comments = df.select(df.comments.count.alias('comments_count'),
                        explode(df.comments.items),
                        df.comments.total_count.alias('total_count'),
                        df.checkin_id)
print(df_comments.columns)

# COMMAND ----------

for col in df_comments_flattened.columns:
    splits = col.split('col_')
    name = splits[len(splits) - 1]
    df_comments_flattened = df_comments_flattened.withColumnRenamed(col, name)
display(df_comments_flattened)

# COMMAND ----------

create_register_delta_table(df_comments_flattened, 'comments',
Example #21
userSchema = StructType().add("ID", "string").add("Lang", "string").add(
    "Date", "string").add("Source", "string").add("len", "integer").add(
        "likes", "integer").add("RT", "integer").add("Hash", "string").add(
            "UserN",
            "string").add("UserID", "string").add("Names", "string").add(
                "Place", "string").add("Follow",
                                       "integer").add("Friend", "integer")
csvdf=spark \
 .readStream \
 .option("sep",";") \
 .schema(userSchema) \
 .csv("hdfs://localhost:9000/stream")

#print(type(csvdf))

tags = csvdf.select("Hash")
#.rdd.flatmap(lambda x:(x.split(','),1))
#print(type(tags))
words = tags.select(explode(split("Hash", ",")).alias("splithash"))

wordcount = words.groupby("splithash").count()

most = wordcount.select("splithash", "count").orderBy("count",
                                                      ascending=False).limit(5)

#query1=tags.collect()
query = most.writeStream.outputMode("complete").format("console").start()

query.awaitTermination(60)
query.stop()
Example #22
 def with_explode_column(df):
     import pyspark.sql.functions as F
     df2 = df.withColumn('values', F.array(F.lit(1), F.lit(2)))
     df2 = df2.withColumn('value', F.explode(df2.values))
     return df2
Example #23
import base64
import re

# ********************************CONFIGURATION**********************************

end = cfg.end_time
start = cfg.start_time

# HDFS path to the impalaQueries_<date>.json file created in step 1
hdfsPath = cfg.hdfs_path + "jsons_" + str(start.month) + "-" + str(start.day) + "_to_" + str(end.month) + "-" + str(end.day) + "/"

# *******************************************************************************

# Read in the JSON data:
spark = SparkSession.builder.getOrCreate()
df = spark.read.json(hdfsPath).withColumn("queries", explode("queries")).select("queries.*")

# Add any missing top-level columns and then select them:
raw_columns = ['coordinator', 'database', 'detailsAvailable', 'durationMillis', 'endTime', 'queryId', 'querystate', 'queryType', 'rowsProduced', 'startTime', 'statement', 'user']
for i in range(len(raw_columns)):
    if raw_columns[i] not in df.columns:
        df = df.withColumn(raw_columns[i], lit(None).cast(StringType()))
df = df.selectExpr('attributes.*', *raw_columns)

# Add any missing attribute-level columns and then select them:
attribute_columns = ['admission_result','admission_wait','bytes_streamed','client_fetch_wait_time','client_fetch_wait_time_percentage','connected_user','ddl_type','delegated_user','estimated_per_node_peak_memory','file_formats','hdfs_average_scan_range','hdfs_bytes_read','hdfs_bytes_read_from_cache','hdfs_bytes_read_from_cache_percentage','hdfs_bytes_read_local','hdfs_bytes_read_local_percentage','hdfs_bytes_read_remote','hdfs_bytes_read_remote_percentage','hdfs_bytes_read_short_circuit','hdfs_bytes_read_short_circuit_percentage','hdfs_bytes_skipped','hdfs_bytes_written','hdfs_scanner_average_bytes_read_per_second','impala_version','memory_accrual','memory_aggregate_peak','memory_per_node_peak','memory_per_node_peak_node','memory_spilled','network_address','oom','original_user','planning_wait_time','planning_wait_time_percentage','pool','query_status','rows_inserted','session_id','session_type','stats_corrupt','stats_missing','thread_cpu_time','thread_cpu_time_percentage','thread_network_receive_wait_time','thread_network_receive_wait_time_percentage','thread_network_send_wait_time','thread_network_send_wait_time_percentage','thread_storage_wait_time','thread_storage_wait_time_percentage','thread_total_time']
for i in range(len(attribute_columns)):
    if attribute_columns[i] not in df.columns:
        df = df.withColumn(attribute_columns[i], lit(None).cast(StringType()))
df = df.withColumn("statement", regexp_replace("statement", "\\s+"," "))
df = df.selectExpr("admission_result", "cast(admission_wait as int)", "cast(bytes_streamed as bigint)", "cast(client_fetch_wait_time as int)", "cast(client_fetch_wait_time_percentage as tinyint)", "connected_user", "ddl_type", "delegated_user", "cast(estimated_per_node_peak_memory as bigint)", "file_formats", "cast(hdfs_average_scan_range as float)", "cast(hdfs_bytes_read as bigint)", "cast(hdfs_bytes_read_from_cache as tinyint)", "cast(hdfs_bytes_read_from_cache_percentage as tinyint)", "cast(hdfs_bytes_read_local as bigint)", "cast(hdfs_bytes_read_local_percentage as tinyint)", "cast(hdfs_bytes_read_remote as bigint)", "cast(hdfs_bytes_read_remote_percentage as tinyint)", "cast(hdfs_bytes_read_short_circuit as bigint)", "cast(hdfs_bytes_read_short_circuit_percentage as tinyint)", "cast(hdfs_bytes_skipped as int)", "cast(hdfs_bytes_written as int)", "cast(hdfs_scanner_average_bytes_read_per_second as float)", "cast(memory_accrual as float)", "cast(memory_aggregate_peak as float)", "cast(memory_per_node_peak as float)", "memory_per_node_peak_node", "cast(memory_spilled as bigint)", "network_address", "cast(oom as boolean)", "original_user", "cast(planning_wait_time as smallint)", "cast(planning_wait_time_percentage as smallint)", "pool", "query_status", "cast(rows_inserted as int)", "session_id ", "session_type", "cast(stats_corrupt as boolean)", "cast(stats_missing as boolean)", "cast(thread_cpu_time as int)", "cast(thread_cpu_time_percentage as tinyint)", "cast(thread_network_receive_wait_time as int)", "cast(thread_network_receive_wait_time_percentage as tinyint)", "cast(thread_network_send_wait_time as int)", "cast(thread_network_send_wait_time_percentage as tinyint)", "cast(thread_storage_wait_time as int)", "cast(thread_storage_wait_time_percentage as tinyint)", "cast(thread_total_time as int)", "coordinator.hostid as hostid", "database", "cast(durationmillis as bigint)", "endtime", "queryid ", "querystate", "querytype", "cast(rowsproduced as bigint)", "starttime", "statement", "user")
Example #24
from pyspark.sql import SparkSession
from pyspark.sql import functions as func

spark = SparkSession.builder.appName("WordCount").getOrCreate()

dataFrame = spark.read.text("file:///SparkCourse/book.txt")

words = dataFrame.select(func.explode(func.split(dataFrame.value,"\\W++")).alias("word"))
words = words.filter(words.word != "")

words = words.select(func.lower(words.word).alias("word"))

words = words.groupBy("word").count()

wordsSource = words.sort("count")

wordsSource.show(wordsSource.count())

Example #25
    s = io.BytesIO(binary)
    r = sr.Recognizer()
    with sr.AudioFile(s) as source:
        audio = r.record(source)
    try:
        print("Transcribing...")
        text = r.recognize_sphinx(audio)
        print("Done!")
        return text
    except:
        msg = "no_transcription_available"
        print("Darn! Could not transcribe audio.")
        return msg

sttudf = udf(lambda z:recognize(z), StringType())
splitudf = udf(lambda x: splitWav(x), ArrayType(BinaryType()))
convertudf = udf(lambda x: convertToWav(x), BinaryType())

df = spark.read.format("binaryFile").option("pathGlobFilter", "DTNS*.mp3").option("recursiveFileLookup", "true").load("s3a://jordan-podcast-s3/")
df = df.withColumn("WAVAudio", convertudf(df.content)).drop("modificationTime","length","content")
df = df.withColumn("splitwavs", splitudf(df.WAVAudio)).drop("WAVAudio")
df = df.withColumn("splitwavs", explode(df.splitwavs))
df = df.repartition(36)
df = df.withColumn("transcriptions", sttudf(df.splitwavs)).drop("splitwavs")
df = df.groupby("path").agg(collect_list('transcriptions').alias("transcriptions"))
df = df.withColumn("transcriptions", concat_ws(" ", "transcriptions"))
df.write.format('org.elasticsearch.spark.sql')\
        .option('es.nodes', '10.0.0.6:9200, 10.0.0.14:9200, 10.0.0.10:9200')\
        .option('es.port', 9200)\
        .option('es.resource', "podcast2/test")\
        .save()
Example #26
                      StructField("Len", StringType(), True),
                      StructField("Likes", StringType(), True),
                      StructField("RTs", StringType(), True),
                      StructField("Hashtags", StringType(), True),
                      StructField("UserMentionNames", StringType(), True),
                      StructField("UserMentionID", StringType(), True),
                      StructField("name", StringType(), True),
                      StructField("Place", StringType(), True),
                      StructField("Followers", StringType(), True),
                      StructField("Friends", StringType(), True)])


inputDF = spark.readStream.schema(schema).option("delimiter",";").option("maxFilesPerTrigger",1).csv(inputPath)




query1 =inputDF.select(explode(split("Hashtags", ",")).alias("Hashtags")).groupBy("Hashtags").count().orderBy('count', ascending=False)

   

#query2 = inputDF.withColumn("Ratio",inputDF.Followers/inputDF.Friends).groupBy('name').agg(max('Ratio').alias('FRRatio')).orderBy('FRRatio', ascending=False)


query1.writeStream.outputMode("complete").format("console").option("numRows",5).start().awaitTermination(100)

    
#query2.writeStream.outputMode("complete").format("console").option("numRows",1).start().awaitTermination(100)


Example #27
 def data(self):
     return self.spark.range(10).toDF('id') \
         .withColumn("vs", array([lit(i) for i in range(20, 30)])) \
         .withColumn("v", explode(col('vs'))).drop('vs')
Example #28
File: sampling.py Project: kdhingra307/ncm
def sample(df, seed=None, samples_per_wiki=1000000):
    """Choose a representative sample of queries from input dataframe.

    Takes in the unsampled query click logs and filters it down into a smaller
    representative sample that can be run through the machine learning
    pipeline. Note that when using data in the `discovery.query_clicks_daily`
    table the query needs to be post-processed to normalize the queries for
    grouping.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe with columns wikiid, query, and session_id.
    seed : int or None, optional
        The random seed used when sampling. If None a seed will be chosen
        randomly. (Default: None)
    samples_per_wiki : int, optional
        The desired number of distinct (query, hit_page_id) pairs in the
        output. This constraint is approximate and the returned number
        of queries may vary per wiki. (Default: 1000000)

    Returns
    -------
    tuple of (dict, pyspark.sql.DataFrame)
        A dict mapping wikiid to the number of distinct (query, hit_page_id)
        pairs, and the input DataFrame with all the columns it originally had,
        sampled down based on the provided constraints.
    """
    mjolnir.spark.assert_columns(df, ['wikiid', 'query', 'hit_page_ids', 'norm_query_id', 'session_id'])

    # We need this df twice, and by default the df coming in here is from
    # mjolnir.norm_query which is quite expensive.
    df.cache()

    # Figure out the percentage of each wiki's norm_query_id's we need to approximately
    # have samples_per_wiki final training samples.
    hit_page_id_counts = (
        df
        .select('wikiid', 'query', F.explode('hit_page_ids').alias('hit_page_id'))
        # We could groupBy('wikiid').agg(F.countDistinct('query', 'hit_page_id'))
        # directly, but this causes spark to blow out memory limits by
        # collecting too much data on too few executors.
        .groupBy('wikiid', 'query')
        .agg(F.countDistinct('hit_page_id').alias('num_hit_page_ids'))
        .groupBy('wikiid')
        .agg(F.sum('num_hit_page_ids').alias('num_hit_page_ids'))
        .collect())

    hit_page_id_counts = {row.wikiid: row.num_hit_page_ids for row in hit_page_id_counts}

    wiki_percents = {}
    needs_sampling = False

    for wikiid, num_hit_page_ids in hit_page_id_counts.items():
        wiki_percents[wikiid] = min(1., float(samples_per_wiki) / num_hit_page_ids)
        if wiki_percents[wikiid] < 1.:
            needs_sampling = True

    if not needs_sampling:
        return hit_page_id_counts, df

    # Aggregate down into a unique set of (wikiid, norm_query_id) and add in a
    # count of the number of unique sessions per pair. We will sample per-strata
    # based on percentiles of num_sessions.
    df_queries_unique = (
        df
        .groupBy('wikiid', 'norm_query_id')
        .agg(F.countDistinct('session_id').alias('num_sessions'))
        # This rdd will be used multiple times through strata generation and
        # sampling. Cache to not duplicate the filtering and aggregation work.
        .cache())

    df_queries_sampled = _sample_queries(df_queries_unique, wiki_percents, seed=seed)

    # Select the rows chosen by sampling from the input df
    df_sampled = (
        df
        .join(df_queries_sampled, how='inner', on=['wikiid', 'norm_query_id'])
        .cache())

    return hit_page_id_counts, df_sampled
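A minimal usage sketch (hypothetical; df_clicks stands for any DataFrame that already carries the five required columns):

# df_clicks must have wikiid, query, hit_page_ids, norm_query_id and
# session_id, as asserted at the top of sample().
hit_page_id_counts, df_sampled = sample(df_clicks, seed=12345, samples_per_wiki=500000)

# hit_page_id_counts maps wikiid -> number of distinct (query, hit_page_id) pairs;
# df_sampled is the input restricted to the sampled (wikiid, norm_query_id) strata.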
Example #29
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import functions as func

spark = SparkSession.builder.appName('WordCount').getOrCreate()

inputDF = spark.read.text('book.txt')

words = inputDF.select(func.explode(func.split(inputDF.value, '\\W+')).alias('word'))
words = words.filter(words.word != '')

lowerCaseWords = words.select(func.lower(words.word).alias('word'))

wordCount = lowerCaseWords.groupBy('word').count()

wordCountSorted = wordCount.sort('count')

wordCountSorted.show(wordCountSorted.count())

spark.stop()
Example #30
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import explode, split, regexp_replace
import re

spark = SparkSession.builder.master("local").appName(
    "Word Count").getOrCreate()
sc = spark.sparkContext
df = spark.createDataFrame([("<#spark><java>", 1),
                            ("<ab.c><p++ython><p.hp>", 2), ("<a>", 3)],
                           ["Tags", "UserId"])
df.show()
df = df.withColumn("Tags", explode(split("Tags", "(?=<)|(?<=>)")))
df = df.withColumn("Tags", regexp_replace("Tags", r'\<|\>', ""))
df.filter(df.Tags != "").show()


def split_tags(row):
    owner_user_id = row.UserId
    tags = row.Tags
    result = re.findall("<(.*?)>", tags)
    list = []
    for res in result:
        list.append([owner_user_id, res])
    print(list)
    return spark.parallelize(list)


# df.rdd.map(split_tags).toDF().show()
Example #31
sentenceDF.show(truncate=False)
(sentenceDF
 .select(removePunctuation(col('sentence')))
 .show(truncate=False))
 
# loading text file

fileName = ""

shakespeareDF = sqlContext.read.text(fileName).select(removePunctuation(col('value')))
shakespeareDF.show(15, truncate=False)

# sample for splitting lines to words and making words as seperate rows in data frames

from pyspark.sql.functions import split, explode
shakeWordsDF1 = shakespeareDF.select(explode(split(shakespeareDF.sentence,' ')).alias('word'))
shakeWordsDF = shakeWordsDF1.where(shakeWordsDF1.word !="")
shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

from pyspark.sql.functions import desc
topWordsAndCountsDF = wordCount(shakeWordsDF).orderBy(desc("count"))
topWordsAndCountsDF.show()

# sample for creating collection of few records 
from faker import Factory
fake = Factory.create()
fake.seed(4321)
from pyspark.sql import Row
def fake_entry():
Example #32
    .getOrCreate()
# Create DataFrame representing the stream of input lines from connection to host:port
data_schema = [
    StructField("ID", StringType(), True),
    StructField("Language", StringType(), True),
    StructField("Date", StringType(), True),
    StructField("Source", StringType(), True),
    StructField("Length", IntegerType(), True),
    StructField("Likes", IntegerType(), True),
    StructField("Retweets", IntegerType(), True),
    StructField("Hashtags", StringType(), True),
    StructField("UserMentionNames", StringType(), True),
    StructField("UserMentionID", StringType(), True),
    StructField("Name", StringType(), True),
    StructField("Place", StringType(), True),
    StructField("Followers", IntegerType(), True),
    StructField("Friends", IntegerType(), True)
]
finalschema = StructType(fields=data_schema)
v = spark \
    .readStream \
    .format("csv") \
    .option("sep", ";") \
    .schema(finalschema) \
    .load("hdfs://localhost:9000/stream")
v = v.select((explode(split("Hashtags", ","))).alias("Hashtags"))
v = v.groupby("Hashtags").count()
v.createOrReplaceTempView("twitter")
top = spark.sql("SELECT Hashtags, count FROM twitter ORDER BY 2 DESC LIMIT 5")
finalquery(top)
Example #33
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC 
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
# shakeWordsDF = (shakespeareDF
#                 .select(shakespeareDF[0].alias('sentence'))
#                ) 
shakeWordsDF = (shakespeareDF
                .select(split(shakespeareDF[0], " ").alias('wordLst'))
               )
shakeWordsDF = (shakeWordsDF
                .select(explode(shakeWordsDF.wordLst).alias('word'))
                .where("word != ''")
               )

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------
Example #34
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--docs_path', default='data/wiki-sample/AA')
    parser.add_argument('-p', '--prepro_path', default='data/prepro')
    parser.add_argument('-q',
                        '--queries_path',
                        default='data/queries/sample.json')
    parser.add_argument('-o', '--output_path', default='data/output')
    parser.add_argument('-m',
                        '--mode',
                        choices=['prepro', 'fit', 'query'],
                        default='prepro')
    parser.add_argument('-dl', '--docs_limit', type=int)
    parser.add_argument('-ql', '--queries_limit', type=int)
    parser.add_argument('-il',
                        '--inverted_index_limit',
                        type=int,
                        default=5000)
    args = parser.parse_args()
    print('Running BigramPipeline with args: {}'.format(args))

    spark = SparkSession.builder.appName('BigramModel').getOrCreate()

    tokenIdsUdf = udf(lambda x: x.indices.tolist(), ArrayType(IntegerType()))
    tfIdfModelPath = os.path.join(args.prepro_path, 'tf_idf_model')
    docsTfIdfPath = os.path.join(args.prepro_path, 'docs_tf_idf')
    docsTokenIdsPath = os.path.join(args.prepro_path, 'docs_token_ids')
    docsBigramsPath = os.path.join(args.prepro_path, 'docs_bigrams')

    parser = WikiParser(inputCol='text',
                        outputCol='text_parsed',
                        minParagraphs=1,
                        minCharacters=500)
    tokenizer = Tokenizer(inputCol='text_parsed', outputCol='unigrams')
    ngrams = NGram(inputCol='unigrams', outputCol='bigrams', n=2)
    concat = Concat(inputCols=['unigrams', 'bigrams'], outputCol='tokens')

    if args.mode == 'prepro':
        spark.sparkContext.setJobGroup('input', 'Read input data')
        docs = spark.read.json(args.docs_path)
        if args.docs_limit is not None:
            docs = docs.limit(args.docs_limit)

        spark.sparkContext.setJobGroup('parse_docs', 'Parse wiki documents')
        docsParsed = parser.transform(docs)
        docsParsed = checkpoint(spark, docsParsed,
                                os.path.join(args.prepro_path, 'docs_parsed'))

        spark.sparkContext.setJobGroup('tokenize', 'Tokenize documents')
        docsTokenized = tokenizer.transform(docsParsed)
        docsTokenized = checkpoint(
            spark, docsTokenized,
            os.path.join(args.prepro_path, 'docs_tokenized'))

        spark.sparkContext.setJobGroup('ngrams', 'Compute bigrams')
        docsBigrams = ngrams.transform(docsTokenized)
        docsBigrams = concat.transform(docsBigrams)
        docsBigrams.write.parquet(docsBigramsPath)
    elif args.mode == 'fit':
        spark.sparkContext.setJobGroup('input', 'Read input data')
        docsBigrams = spark.read.parquet(docsBigramsPath).select(
            'id', 'tokens')
        tf = CountVectorizer(inputCol='tokens',
                             outputCol='tf',
                             vocabSize=10000000,
                             minDF=2.0,
                             minTF=3.0)
        idf = IDF(inputCol='tf', outputCol='idf')

        spark.sparkContext.setJobGroup('tf', 'Fit TF model')
        tfModel = tf.fit(docsBigrams)
        docsTf = tfModel.transform(docsBigrams)
        docsTf = checkpoint(spark, docsTf,
                            os.path.join(args.prepro_path, 'docs_tf'))

        spark.sparkContext.setJobGroup('idf', 'Fit IDF model')
        idfModel = idf.fit(docsTf)
        docsTfIdf = idfModel.transform(docsTf)
        docsTfIdf = docsTfIdf.select(docsTfIdf.id.alias('doc_id'),
                                     docsTfIdf.idf.alias('doc_idf'))
        docsTfIdf = checkpoint(spark, docsTfIdf, docsTfIdfPath)
        tfIdfModel = PipelineModel(
            stages=[tokenizer, ngrams, concat, tfModel, idfModel])
        tfIdfModel.save(tfIdfModelPath)

        spark.sparkContext.setJobGroup('docs_token_ids',
                                       'Compute inverted index')
        docsTokenIds = docsTfIdf.select(
            docsTfIdf.doc_id,
            explode(tokenIdsUdf(docsTfIdf.doc_idf)).alias('token_id'))
        docsTokenIds.write.parquet(docsTokenIdsPath)
    elif args.mode == 'query':
        assert args.queries_path is not None

        spark.sparkContext.setJobGroup('input', 'Read input data')
        tfIdfModel = PipelineModel.load(tfIdfModelPath)
        docsTfIdf = spark.read.parquet(docsTfIdfPath)
        docsTokenIds = spark.read.parquet(docsTokenIdsPath)
        queries = spark.read.json(args.queries_path)
        if args.queries_limit is not None:
            queries = queries.limit(args.queries_limit)
        queries = queries.select(queries._id.alias('query_id'),
                                 queries.question.alias('text_parsed'))

        spark.sparkContext.setJobGroup('queries_tf_idf',
                                       'Apply TF-IDF to queries')
        queriesTfIdf = tfIdfModel.transform(queries)
        queriesTfIdf = queriesTfIdf.select(queriesTfIdf.query_id,
                                           queriesTfIdf.tf.alias('query_tf'))
        queriesTfIdf = checkpoint(
            spark, queriesTfIdf,
            os.path.join(args.output_path, 'queries_tf_idf'))
        print('Finished query TF IDF')

        spark.sparkContext.setJobGroup('queries_token_ids',
                                       'Compute query token IDs')
        queriesTokenIds = queriesTfIdf.select(
            queriesTfIdf.query_id,
            explode(tokenIdsUdf(queriesTfIdf.query_tf)).alias('token_id'))
        queriesTokenIds = checkpoint(
            spark, queriesTokenIds,
            os.path.join(args.output_path, 'queries_token_ids'))
        print('Finished query token IDs')

        spark.sparkContext.setJobGroup('doc_queries',
                                       'Perform inverted index filtering')
        docQueries = docsTokenIds.join(queriesTokenIds, on='token_id').groupby(
            'query_id', 'doc_id').count()
        window = Window.partitionBy(docQueries.query_id).orderBy(
            col('count').desc())
        docQueries = docQueries.withColumn('rank', row_number().over(window)) \
                        .filter(col('rank') <= args.inverted_index_limit) \
                        .select('query_id', 'doc_id')
        docQueries = checkpoint(spark, docQueries,
                                os.path.join(args.output_path, 'doc_queries'))
        print('Finished inverted index filter')

        spark.sparkContext.setJobGroup('score', 'Perform scoring')
        docQueries = docQueries.join(docsTfIdf, on='doc_id').join(queriesTfIdf, on='query_id') \
                        .select('query_id', 'doc_id', 'query_tf', 'doc_idf')
        docQueries = Dot(inputCols=['doc_idf', 'query_tf'],
                         outputCol='score').transform(docQueries)
        queryResults = docQueries.select('query_id', 'doc_id', 'score')
        queryResults.write.parquet(
            os.path.join(args.output_path, 'query_results'))
        print('Wrote output to {}'.format(args.output_path))

    spark.stop()
Example #35
File: wc.py Project: cottrell/notebooks
    .builder \
    .appName("StructuredNetworkWordCount") \
    .getOrCreate()

# Create DataFrame representing the stream of input lines from connection to localhost:9999
lines = spark \
    .readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9999) \
    .load()

# Split the lines into words
words = lines.select(
   explode(
       split(lines.value, " ")
   ).alias("word")
)

# Generate running word count
wordCounts = words.groupBy("word").count()

output_mode = "complete"
# output_mode = "append"

# Start running the query that prints the running counts to the console
query = wordCounts \
    .writeStream \
    .outputMode(output_mode) \
    .format("console") \
    .start()
Example #36
data = data.withColumn("subject", regexp_replace("subject", "\s+", ""))
# data = data.withColumn("subject", regexp_replace("subject"," ",""))

# Question 1 Answer
# Get the count of the emails
data.select("location").distinct().count()

text_file = open("Q1.txt", "w")
text_file.write("%s" % int(data.select("location").distinct().count()))
text_file.close()

# Verified by calculating the number of files from os directory

data = data.select(
    "*",
    (explode(split(col("to"), ",")).alias("to_clean"))).where('to_clean != ""')
data = data.withColumn("to_clean", regexp_replace("to_clean", "\s+", ""))
data = data.withColumn("to_clean", regexp_replace("to_clean", " ", ""))

# Question 2
# Here we explode the to column, since it is a comma-separated list, and then count messages per recipient
data_q2 = data.groupBy('to_clean').agg(
    count("message").alias("total_messages")).orderBy("total_messages",
                                                      ascending=False)

data_q2 = data_q2.toPandas()
data_q2.to_csv("Q2.txt", index=False, sep="\t")
# Verified by looking at the volume of messages per person; they seem to be in line

# Question 3:
# Here we do a self join, match the from and to columns, and match the subject with "Re:" removed to identify responses
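Question 3 is not implemented in this excerpt; the following is only a sketch of the described self join, assuming a from column exists alongside the columns used above and the same pyspark.sql.functions imports (col, regexp_replace):

# Strip a leading "Re:" from the (whitespace-stripped) subject so a reply
# lines up with the subject of the original message.
emails = data.withColumn("subject_clean", regexp_replace(col("subject"), r"^Re:", ""))

orig = emails.alias("orig")
resp = emails.alias("resp")

# A response goes from an original recipient back to the original sender,
# with the same subject once the leading "Re:" is removed.
responses = orig.join(
    resp,
    (col("resp.from") == col("orig.to_clean")) &
    (col("resp.to_clean") == col("orig.from")) &
    (col("resp.subject_clean") == col("orig.subject")),
    how="inner")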
Example #37
# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (shakespeareDF
                <FILL IN>)

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import split, size, explode
shakeWordsDF = (shakespeareDF
                .select(split('sentence', '\s+').alias('words'))
                .select(explode('words').alias('word'))
                .where(col('word') != ''))

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------

# PRIVATE_TEST Remove empty elements (4d)
Example #38
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC   + The first issue is that we need to split each line by its spaces.
# MAGIC   + The second issue is that we need to filter out empty lines or words.
# MAGIC 
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row.  To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC 
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC 
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (
  shakespeareDF.select(explode(split(shakespeareDF.sentence, " ")))
  .where("col != ''")
  .selectExpr("col as word"))

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC   + The first issue is that we need to split each line by its spaces.
# MAGIC   + The second issue is that we need to filter out empty lines or words.
# MAGIC 
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row.  To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC 
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC 
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (shakespeareDF
                .select(explode(split(shakespeareDF.sentence, ' '))
                .alias("word"))
                .where("word != ''"))

shakeWordsDF.show(truncate=False)
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount


# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------
    def rdd_to_recordstore(rdd_transform_context_rdd):

        if rdd_transform_context_rdd.isEmpty():
            MonMetricsKafkaProcessor.log_debug(
                "rdd_to_recordstore: nothing to process...")
        else:

            sql_context = SQLContext(rdd_transform_context_rdd.context)
            data_driven_specs_repo = DataDrivenSpecsRepoFactory.\
                get_data_driven_specs_repo()
            pre_transform_specs_df = data_driven_specs_repo.\
                get_data_driven_specs(
                    sql_context=sql_context,
                    data_driven_spec_type=DataDrivenSpecsRepo.
                    pre_transform_specs_type)

            #
            # extract second column containing raw metric data
            #
            raw_mon_metrics = rdd_transform_context_rdd.map(
                lambda nt: nt.rdd_info[1])

            #
            # convert raw metric data rdd to dataframe rdd
            #
            raw_mon_metrics_df = \
                MonMetricUtils.create_mon_metrics_df_from_json_rdd(
                    sql_context,
                    raw_mon_metrics)

            #
            # filter out unwanted metrics and keep metrics we are interested in
            #
            cond = [
                raw_mon_metrics_df.metric.name ==
                pre_transform_specs_df.event_type]
            filtered_metrics_df = raw_mon_metrics_df.join(
                pre_transform_specs_df, cond)

            #
            # validate filtered metrics to check if required fields
            # are present and not empty
            # In order to apply the filter function we had to convert the
            # dataframe to a plain RDD. After validation the RDD is
            # converted back to a dataframe.
            #
            # FIXME: find a way to apply filter function on dataframe rdd data
            validated_mon_metrics_rdd = filtered_metrics_df.rdd.filter(
                MonMetricsKafkaProcessor._validate_raw_mon_metrics)
            validated_mon_metrics_df = sql_context.createDataFrame(
                validated_mon_metrics_rdd, filtered_metrics_df.schema)

            #
            # record generator
            # generate a new intermediate metric record if a given metric's
            # metric_id_list in the pre_transform_specs table has several
            # intermediate metrics defined.
            # intermediate metrics are used as a convenient way to
            # process an (aggregated) metric in multiple ways by making a copy
            # of the source data for each processing step
            #
            gen_mon_metrics_df = validated_mon_metrics_df.select(
                validated_mon_metrics_df.meta,
                validated_mon_metrics_df.metric,
                validated_mon_metrics_df.event_processing_params,
                validated_mon_metrics_df.event_type,
                explode(validated_mon_metrics_df.metric_id_list).alias(
                    "this_metric_id"),
                validated_mon_metrics_df.service_id)

            #
            # transform metrics data to record_store format.
            # The record store format is the common format that serves as
            # the source for aggregation processing; converting metrics to
            # this standard format makes it possible to write generic,
            # reusable aggregation routines driven by configuration
            # parameters.
            #
            record_store_df = gen_mon_metrics_df.select(
                (gen_mon_metrics_df.metric.timestamp / 1000).alias(
                    "event_timestamp_unix"),
                from_unixtime(
                    gen_mon_metrics_df.metric.timestamp / 1000).alias(
                    "event_timestamp_string"),
                gen_mon_metrics_df.event_type.alias("event_type"),
                gen_mon_metrics_df.event_type.alias("event_quantity_name"),
                (gen_mon_metrics_df.metric.value / 1.0).alias(
                    "event_quantity"),
                when(gen_mon_metrics_df.metric.dimensions.state != '',
                     gen_mon_metrics_df.metric.dimensions.state).otherwise(
                    'NA').alias("event_status"),
                lit('1.0').alias('event_version'),
                lit('metrics').alias("record_type"),

                # resource_uuid
                when(gen_mon_metrics_df.metric.dimensions.instanceId != '',
                     gen_mon_metrics_df.metric.dimensions.instanceId).when(
                    gen_mon_metrics_df.metric.dimensions.resource_id != '',
                    gen_mon_metrics_df.metric.dimensions.resource_id).
                otherwise('NA').alias("resource_uuid"),

                when(gen_mon_metrics_df.metric.dimensions.tenantId != '',
                     gen_mon_metrics_df.metric.dimensions.tenantId).when(
                    gen_mon_metrics_df.metric.dimensions.tenant_id != '',
                    gen_mon_metrics_df.metric.dimensions.tenant_id).when(
                    gen_mon_metrics_df.metric.dimensions.project_id != '',
                    gen_mon_metrics_df.metric.dimensions.project_id).otherwise(
                    'NA').alias("tenant_id"),

                when(gen_mon_metrics_df.metric.dimensions.mount != '',
                     gen_mon_metrics_df.metric.dimensions.mount).otherwise(
                    'NA').alias("mount"),

                when(gen_mon_metrics_df.metric.dimensions.device != '',
                     gen_mon_metrics_df.metric.dimensions.device).otherwise(
                    'NA').alias("device"),

                when(gen_mon_metrics_df.meta.userId != '',
                     gen_mon_metrics_df.meta.userId).otherwise('NA').alias(
                    "user_id"),

                when(gen_mon_metrics_df.meta.region != '',
                     gen_mon_metrics_df.meta.region).when(
                    gen_mon_metrics_df.event_processing_params
                    .set_default_region_to != '',
                    gen_mon_metrics_df.event_processing_params
                    .set_default_region_to).otherwise(
                    'NA').alias("region"),

                when(gen_mon_metrics_df.meta.zone != '',
                     gen_mon_metrics_df.meta.zone).when(
                    gen_mon_metrics_df.event_processing_params
                    .set_default_zone_to != '',
                    gen_mon_metrics_df.event_processing_params
                    .set_default_zone_to).otherwise(
                    'NA').alias("zone"),

                when(gen_mon_metrics_df.metric.dimensions.hostname != '',
                     gen_mon_metrics_df.metric.dimensions.hostname).when(
                    gen_mon_metrics_df.metric.value_meta.host != '',
                    gen_mon_metrics_df.metric.value_meta.host).otherwise(
                    'NA').alias("host"),

                when(gen_mon_metrics_df.service_id != '',
                     gen_mon_metrics_df.service_id).otherwise(
                    'NA').alias("service_group"),

                when(gen_mon_metrics_df.service_id != '',
                     gen_mon_metrics_df.service_id).otherwise(
                    'NA').alias("service_id"),

                from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                              'yyyy-MM-dd').alias("event_date"),
                from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                              'HH').alias("event_hour"),
                from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                              'mm').alias("event_minute"),
                from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                              'ss').alias("event_second"),
                gen_mon_metrics_df.this_metric_id.alias("metric_group"),
                gen_mon_metrics_df.this_metric_id.alias("metric_id"))

            #
            # get transform context
            #
            rdd_transform_context = rdd_transform_context_rdd.first()
            transform_context = rdd_transform_context.transform_context_info

            #
            # cache record store rdd
            #
            if cfg.CONF.service.enable_record_store_df_cache:
                storage_level_prop = \
                    cfg.CONF.service.record_store_df_cache_storage_level
                storage_level = StorageUtils.get_storage_level(
                    storage_level_prop)
                record_store_df.persist(storage_level)

            #
            # start processing metrics available in record_store data
            #
            MonMetricsKafkaProcessor.process_metrics(transform_context,
                                                     record_store_df)

            # remove df from cache
            if cfg.CONF.service.enable_record_store_df_cache:
                record_store_df.unpersist()

            #
            # extract kafka offsets and batch processing time
            # stored in transform_context and save offsets
            #
            offsets = transform_context.offset_info

            # batch time
            batch_time_info = \
                transform_context.batch_time_info

            MonMetricsKafkaProcessor.save_kafka_offsets(
                offsets, rdd_transform_context_rdd.context.appName,
                batch_time_info)

            # call the pre hourly processor, if it's time to run
            if (cfg.CONF.stage_processors.pre_hourly_processor_enabled
                    is True and PreHourlyProcessor.is_time_to_run(
                        batch_time_info)):
                PreHourlyProcessor.run_processor(
                    record_store_df.rdd.context,
                    batch_time_info)
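
A minimal standalone sketch (not part of the job above) of the record-generator idea: exploding `metric_id_list` turns one validated metric record into one record per intermediate metric id. The SparkSession and the toy column names below are assumptions used only for illustration.

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode

spark = SparkSession.builder.master("local[1]").appName("explode_sketch").getOrCreate()

# one toy source record carrying two intermediate metric ids
source = spark.createDataFrame(
    [("mem.used_mb", 512.0, ["mem_total_all", "mem_total_host"])],
    ["event_type", "value", "metric_id_list"])

# one input row becomes one output row per id in metric_id_list
generated = source.select(
    "event_type", "value",
    explode("metric_id_list").alias("this_metric_id"))
generated.show()
# -> two rows: (mem.used_mb, 512.0, mem_total_all) and
#              (mem.used_mb, 512.0, mem_total_host)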
Example #41
0
def gapply(grouped_data, func, schema, *cols):
    """Applies the function ``func`` to data grouped by key. In particular, given a dataframe
    grouped by some set of key columns key1, key2, ..., keyn, this method groups all the values
    for each row with the same key columns into a single Pandas dataframe and by default invokes
    ``func((key1, key2, ..., keyn), values)`` where the number and order of the key arguments is
    determined by columns on which this instance's parent :class:`DataFrame` was grouped and
    ``values`` is a ``pandas.DataFrame`` of columns selected by ``cols``, in that order.

    If there is only one key then the key tuple is automatically unpacked, with
    ``func(key, values)`` called.

    ``func`` is expected to return a ``pandas.DataFrame`` of the specified schema ``schema``,
    which should be of type :class:`StructType` (output columns are of this name and order).

    If ``spark.conf.get("spark.sql.retainGroupColumns")`` is not ``u'true'``, then ``func`` is
    called with an empty key tuple (note it is set to ``u'true'`` by default).

    If no ``cols`` are specified, then all non-key columns will be offered, in the order of the
    columns in the original dataframe. In either case, the Pandas columns will be named
    according to the DataFrame column names.

    The order of the rows passed in as Pandas rows is not guaranteed to be stable relative to
    the original row order.

    :note: Users must ensure that the grouped values for every group fit entirely in memory.
    :note: This method is only available if Pandas is installed.

    :param grouped_data: data grouped by key
    :param func: a two argument function, which may be either a lambda or named function
    :param schema: the return schema for ``func``, a :class:`StructType`
    :param cols: list of column names (string only)

    :raise ValueError: if ``"*"`` is in ``cols``
    :raise ValueError: if ``cols`` contains duplicates
    :raise ValueError: if ``schema`` is not a :class:`StructType`
    :raise ImportError: if ``pandas`` module is not installed
    :raise ImportError: if ``pandas`` version is too old (less than 0.7.1)

    :return: the new :class:`DataFrame` with the original key columns replicated for each returned
             value in each group's resulting pandas dataframe, the schema being the original key
             schema prepended to ``schema``, where all the resulting groups' rows are concatenated.
             Of course, if retaining group columns is disabled, then the output will exactly match
             ``schema`` since no keys can be prepended.

    >>> import pandas as pd
    >>> from pyspark.sql import SparkSession
    >>> from spark_sklearn.group_apply import gapply
    >>> from spark_sklearn.util import createLocalSparkSession
    >>> spark = createLocalSparkSession()
    >>> df = (spark
    ...     .createDataFrame([Row(course="dotNET", year=2012, earnings=10000),
    ...                       Row(course="Java",   year=2012, earnings=20000),
    ...                       Row(course="dotNET", year=2012, earnings=5000),
    ...                       Row(course="dotNET", year=2013, earnings=48000),
    ...                       Row(course="Java",   year=2013, earnings=30000)])
    ...     .select("course", "year", "earnings"))
    >>> def yearlyMedian(_, vals):
    ...     all_years = set(vals['year'])
    ...     # Note that interpolation is performed, so we need to cast back to int.
    ...     yearly_median = [(year, int(vals['earnings'][vals['year'] == year].median()))
    ...                      for year in all_years]
    ...     return pd.DataFrame.from_records(yearly_median)
    >>> newSchema = StructType().add("year", LongType()).add("median_earnings", LongType())
    >>> gapply(df.groupBy("course"), yearlyMedian, newSchema).orderBy("median_earnings").show()
    +------+----+---------------+
    |course|year|median_earnings|
    +------+----+---------------+
    |dotNET|2012|           7500|
    |  Java|2012|          20000|
    |  Java|2013|          30000|
    |dotNET|2013|          48000|
    +------+----+---------------+
    <BLANKLINE>
    >>> def twoKeyYearlyMedian(_, vals):
    ...     return pd.DataFrame.from_records([(int(vals["earnings"].median()),)])
    >>> newSchema = StructType([df.schema["earnings"]])
    >>> gapply(df.groupBy("course", "year"), twoKeyYearlyMedian, newSchema, "earnings").orderBy(
    ...     "earnings").show()
    +------+----+--------+
    |course|year|earnings|
    +------+----+--------+
    |dotNET|2012|    7500|
    |  Java|2012|   20000|
    |  Java|2013|   30000|
    |dotNET|2013|   48000|
    +------+----+--------+
    <BLANKLINE>
    >>> spark.stop(); SparkSession._instantiatedContext = None
    """
    import pandas as pd
    minPandasVersion = '0.7.1'
    if LooseVersion(pd.__version__) < LooseVersion(minPandasVersion):
        raise ImportError('Pandas installed but version is {}, {} required'
                          .format(pd.__version__, minPandasVersion))

    # Do a null aggregation to retrieve the keys first (should be no computation)
    # Also consistent with spark.sql.retainGroupColumns
    keySchema = grouped_data.agg({}).schema
    keyCols = grouped_data.agg({}).columns

    if not cols:
        # Extract the full column list with the parent df
        javaDFName = "org$apache$spark$sql$RelationalGroupedDataset$$df"
        parentDF = java_gateway.get_field(grouped_data._jgd, javaDFName)
        allCols = DataFrame(parentDF, None).columns
        keyColsSet = set(keyCols)
        cols = [col for col in allCols if col not in keyColsSet]

    if "*" in cols:
        raise ValueError("cols expected to contain only singular columns")

    if len(set(cols)) < len(cols):
        raise ValueError("cols expected not to contain duplicate columns")

    if not isinstance(schema, StructType):
        raise ValueError("output schema should be a StructType")

    inputAggDF = grouped_data.agg({col: 'collect_list' for col in cols})
    # Recover canonical order (aggregation may change column order)
    canonicalOrder = chain(keyCols, [inputAggDF['collect_list(' + col + ')'] for col in cols])
    inputAggDF = inputAggDF.select(*canonicalOrder)

    # Wraps the user-provided function with another python function, which prepares the
    # input in the form specified by the documentation. Then, once the function completes,
    # this wrapper prepends the keys to the output values and converts from pandas.
    def pandasWrappedFunc(*args):
        nvals = len(cols)
        keys, collectedCols = args[:-nvals], args[-nvals:]
        paramKeys = tuple(keys)
        if len(paramKeys) == 1:
            paramKeys = paramKeys[0]
        valuesDF = pd.DataFrame.from_dict(dict(zip(cols, collectedCols)))
        valuesDF = valuesDF[list(cols)]  # reorder to canonical
        outputDF = func(paramKeys, valuesDF)
        valCols = outputDF.columns.tolist()
        for key, keyName in zip(keys, keyCols):
            outputDF[keyName] = key
        outputDF = outputDF[keyCols + valCols]  # reorder to canonical
        # To recover native python types for serialization, we need
        # to convert the pandas dataframe to a numpy array, then to a
        # native list (can't go straight to native, since pandas will
        # attempt to preserve the numpy type).
        return outputDF.values.tolist()

    keyPrependedSchema = StructType(list(chain(keySchema, schema)))
    outputAggSchema = ArrayType(keyPrependedSchema, containsNull=False)
    pandasUDF = udf(pandasWrappedFunc, outputAggSchema)
    outputAggDF = inputAggDF.select(pandasUDF(*inputAggDF))

    explodedDF = outputAggDF.select(explode(*outputAggDF).alias("gapply"))
    # automatically retrieves nested schema column names
    return explodedDF.select("gapply.*")
Example #42
0
# drop() is like the opposite of select(): instead of selecting specific columns from a DataFrame, it drops a specified column from a DataFrame
dataDF.drop('occupation').drop('age').show()

# the sample() transformation returns a new DataFrame with a random sample
sampledDF = dataDF.sample(withReplacement=False, fraction=0.10)
print sampledDF.count()
sampledDF.show()

# split() and explode() transformations
from pyspark.sql.functions import split, explode

shakeWordsSplit = (shakespeareDF
                .select(split(shakespeareDF.word, ' ').alias('word')))  # split() breaks each sentence at spaces, producing an array of words per row

shakeWordsExplode = (shakeWordsSplit
                    .select(explode(shakeWordsSplit.word).alias('word')))  # explode() returns a new row for each element in the array

shakeWordsDF = shakeWordsExplode.filter(shakeWordsExplode.word != '')  # remove all the blank words

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

###############################################################
###															###
###															###
###   						GROUP BY						###
###															###
###															###
###############################################################
Example #43
0
File: productdesc.py Project: AkramC/gitpy
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import Window
spark = SparkSession.builder.appName("product desc").master(
    "local").getOrCreate()
schemaprod = StructType([
    StructField("Id", StringType(), True),
    StructField("Productdesc", StringType(), True)
])
product = spark.read.schema(schemaprod).csv("D://productdesc.csv")
productexpl = product.withColumn("tshrt", F.split("Productdesc", " ")).select(
    "Id", "Productdesc",
    F.explode("tshrt").alias("words"))
productfilter = productexpl.filter(
    F.col("words") == "Tshirt").drop("Productdesc")
win = Window.partitionBy("Id")
priducttshrtt = productfilter.select(
    "Id",
    F.count("Id").over(win).alias("Cnt")).filter(F.col("Cnt") == 2).distinct()
#priducttshrt=productfilter.groupBy("Id").count().filter(F.col("count") == 2)

priducttshrtt.show()
Example #44
0
    .add("Date", "string").add("Source", "string").add("Len", "string") \
    .add("Likes", "string").add("RTs", "string").add("Hashtags", "string") \
    .add("UserMentionName", "string").add("UserMentionID", "string").add("name", "string") \
    .add("Place", "string").add("Followers", "float").add("Friends", "float")

lines = ssc \
    .readStream \
    .format("csv") \
    .option("header", True) \
    .schema(schema) \
    .option("sep", ";") \
    .csv('hdfs://localhost:9000/user/chaitra/stream/')

words = lines.select(
    explode(
        split(lines.Hashtags, ",")
    ).alias("Hashtags"), "ID"
)

# lines.createOrReplaceTempView("frr")

hash1 = words.groupBy("Hashtags").count().sort(desc("count"),asc("Hashtags"))
hash1.createOrReplaceTempView("hashtable")
hash1 = ssc.sql("select * from hashtable limit 5")

# frr = ssc.sql("select * from frr ")
# frr = lines.withColumn('FRRatio', frr.Followers/frr.Friends)
# frr=frr.groupBy((['name', frr.Followers, frr.Friends,frr.FRRatio])).count().sort(col("FRRatio").desc())
# frr = frr.drop("count")
# frr = frr.drop("Followers")
# frr = frr.drop("Friends")
Example #45
0
print wordCount

# COMMAND ----------

# TEST
Test.assertEquals(wordCount, 1903220, 'incorrect word count')

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we'll compute the word count using `select`, the function `func.explode()`, and then taking a `count()` on the `DataFrame`.  Make sure to name the column returned by the `explode` function 'word'.

# COMMAND ----------

# ANSWER
wordList = noStopWords.select(func.explode('words').alias('word'))

# Note that we have one word per Row now
print wordList.take(3)
wordListCount = wordList.count()
print wordListCount

# COMMAND ----------

# TEST
Test.assertEquals(wordListCount, 1903220, 'incorrect value for wordListCount')

# COMMAND ----------

# MAGIC %md
# MAGIC For your final task, you'll group by word and count the number of times each word occurs.  Make sure to return the counts in descending order and to call them `counts`.
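
# COMMAND ----------

# A hedged sketch (not the official lab answer) of the grouping step described
# above: group the `wordList` DataFrame from the previous cell by word, count
# occurrences, rename the result column to `counts`, and sort descending.
topWordsAndCountsDF = (wordList
                       .groupBy('word')
                       .count()
                       .withColumnRenamed('count', 'counts')
                       .orderBy('counts', ascending=False))
topWordsAndCountsDF.show(10)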
Example #46
0
def get_weather_station_weather_df(spark, stations_id):
    ''' Download the weather station data during all hours of
        the 5 years for given station ids and return a dataframe
    '''
    cache_file = workdir + 'data/weather_stations.parquet'
    if isdir(cache_file):
        print('Skip downloading weather station: already done')
        return spark.read.parquet(cache_file)

    get_station_weather_month_udf = \
        udf(get_station_weather_month, ArrayType(StructType([
            StructField('day', IntegerType()),
            StructField('hour', IntegerType()),
            StructField('dew_point_temp', FloatType()),
            StructField('rel_hum', FloatType()),
            StructField('wind_dir', FloatType()),
            StructField('wind_spd', FloatType()),
            StructField('visibility', FloatType()),
            StructField('stn_press', FloatType()),
            StructField('hmdx', FloatType()),
            StructField('wind_chill', FloatType()),
            StructField('temp', FloatType()),
            StructField('risky_weather', FloatType())
        ])))

    month_per_year_df = spark.createDataFrame(zip(range(1, 13), ), ['month'])
    years_df = spark.createDataFrame(zip(range(2012, 2018), ), ['year'])
    months_df = years_df.crossJoin(month_per_year_df)
    stations_months_df = stations_id.crossJoin(months_df)

    c = col('col')

    def create_date(year, month, day):
        return datetime.datetime.strptime(f'{year}-{month}-{day}', "%Y-%m-%d")

    create_date_udf = udf(create_date, DateType())

    df = (stations_months_df.repartition(200, 'year', 'month').withColumn(
        'weather',
        get_station_weather_month_udf('station_id', 'year', 'month')).select(
            'station_id', 'year', 'month', explode('weather')).select(
                'station_id',
                create_date_udf('year', 'month', c['day']).alias('date'),
                c['hour'].alias('hour'),
                c['dew_point_temp'].alias('dew_point_temp'),
                c['rel_hum'].alias('rel_hum'), c['wind_dir'].alias('wind_dir'),
                c['wind_spd'].alias('wind_spd'),
                c['visibility'].alias('visibility'),
                c['stn_press'].alias('stn_press'), c['hmdx'].alias('hmdx'),
                c['wind_chill'].alias('wind_chill'), c['temp'].alias('temp'),
                c['risky_weather'].alias('risky_weather')))

    # We compute a moving average of risky_weather since the effect of risky
    # weather is spread over the following hours
    def weighted_average(c, window, offsets, weights):
        def value(i):
            return lag(c, -i).over(window)

        values = [
            coalesce(value(i) * w, lit(0)) for i, w in zip(offsets, weights)
        ]

        return sum(values, lit(0))

    window = (Window.partitionBy('station_id').orderBy('date'))
    offsets = range(-23, 1)
    weights = [exp(0.5 * t) for t in offsets]
    weights = [w / sum(weights) for w in weights]
    df = df.withColumn(
        'risky_weather',
        weighted_average('risky_weather', window, offsets, weights))

    df.write.parquet(cache_file)

    return df
Example #47
0
from lib.pos_tags import PosTags
from lib.chunks import Chunks

t = Tokens()
p = PosTags()
c = Chunks()

c.train(c.load_training_data("../data/chunker_training_50_fixed.json"))


def pipeline(s):
    """
    Given a string, return a list of relations
    """
    return c.assemble(c.tag(p.tag(t.tokenize(s))))


pipeline_udf = sql.udf(pipeline, types.ArrayType(types.MapType(types.StringType(), types.StringType())))


phrases = (
    notes.withColumn("phrases", pipeline_udf(notes["document"]))
    .select(sql.explode(sql.col("phrases")).alias("text"))
    .filter(sql.col("text")["tag"] == "NP")
    .select(sql.lower(sql.col("text")["phrase"]).alias("phrase"))
    .groupBy(sql.col("phrase"))
    .count()
)

phrases.write.parquet("../data/idigbio_phrases.parquet")
		.add("2", "string") \
		.add("3", "string") \
		.add("4", "string") \
		.add("5", "string") \
		.add("6", "string") \
		.add("7", "string") \
		.add("c8", "string") \
		.add("9", "string") \
		.add("10", "string") \
		.add("11", "string") \
		.add("12", "string") \
		.add("c13", "string") \
		.add("c14", "string")

csvDF = spark \
    .readStream \
    .option("sep", ";") \
    .schema(userSchema) \
    .csv("hdfs://localhost:9000/stream/")


hashtags = csvDF.select("c8")
words = hashtags.select(explode(split(hashtags.c8, ",")))
words = words.withColumnRenamed("col", "Hashtags")


word = words.groupBy("Hashtags").count() \
    .orderBy("count", ascending=False).limit(5) \
    .writeStream.outputMode("complete").format("console") \
    .start().awaitTermination(60)

# word.stop()
spark.stop()
# MAGIC 
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC   + The first issue is that we need to split each line by its spaces.
# MAGIC   + The second issue is that we need to filter out empty lines or words.
# MAGIC 
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row.  To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC 
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC 
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (shakespeareDF.select(explode(split(shakespeareDF[0],"\s+")).alias("word"))).where("length(word) > 0")

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (4e) Count the words **
Example #50
0
    def test_count(self, input_df, default_params):
        expected_count = input_df.select(
            sql_funcs.explode(
                input_df[default_params["path_to_array"]])).count()
        actual_count = Exploder(**default_params).transform(input_df).count()
        assert expected_count == actual_count
Example #51
0
  
  @f.pandas_udf(schema, f.PandasUDFType.GROUPED_MAP)
  def composite_udf(df):
    rows = []
    all_sum = float(df['count'].sum()) / float(df['composite_time_count'].sum())
    rows.append({"fraction": all_sum})
    data = pd.DataFrame(rows).assign(_dummy=1)
    grouping_variables = df[grouping_fields].iloc[:1].assign(_dummy=1)
    result = grouping_variables.merge(data, on="_dummy").drop("_dummy", axis=1)
    return result
  

composite_times = (subsessions.filter("app_build_id = '20181228093007'")
                   .filter("normalized_channel == 'nightly'")
                   .select('client_id','branch', 'composite_time_count', 'composite_time_sum',
                           f.explode('composite_time').alias("bucket", "count")
                          )
                   .groupBy('client_id', 'branch', 'bucket')
                   .apply(composite_udf)
                  )
composite_times.createOrReplaceGlobalTempView("composite_times")
composite_times.take(3)

# COMMAND ----------

# MAGIC %r
# MAGIC metrics = tbl(sc, "global_temp.composite_times")
# MAGIC per_user_build = metrics %>%
# MAGIC   select(branch, client_id, ends_with("_count"), ends_with("_sum")) %>%
# MAGIC   select(-starts_with("device_reset_reason")) %>%
# MAGIC   group_by(branch, client_id) %>%
# MAGIC 
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC   + The first issue is that we need to split each line by its spaces.
# MAGIC   + The second issue is that we need to filter out empty lines or words.
# MAGIC 
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row.  To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC 
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC 
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode, col
shakeWordsDF = (shakespeareDF.select(explode(split(col('sentence'),' ')).alias('word')))

shakeWordsDF = shakeWordsDF.filter(col('word') != '')

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------
    def data(self):
        return self.spark.range(10).toDF('id') \
            .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \
            .withColumn("v", explode(col('vs'))) \
            .drop('vs') \
            .withColumn('w', lit(1.0))
import random
from pyspark.sql.types import *

# we randomly select maximum 10 points within a same polygon of size 11 (30m)
def sample(latitudes, longitudes):
  l = list(zip(latitudes, longitudes))
  return random.sample(l, min(len(l), 10))

sample_schema = ArrayType(StructType([StructField("latitude", DoubleType()), StructField("longitude", DoubleType())]))
sample_udf = udf(sample, sample_schema)

sample_df = (
  points_df
    .groupBy(to_h3(F.col("latitude"), F.col("longitude"), F.lit(11)))
    .agg(F.collect_list(F.col("latitude")).alias("latitudes"), F.collect_list(F.col("longitude")).alias("longitudes"))
    .withColumn('sample', F.explode(sample_udf(F.col('latitudes'), F.col('longitudes'))))
    .select('sample.latitude', 'sample.longitude')
)
sample_df = sample_df.repartition(sc.defaultParallelism * 20).cache()
sample_count = sample_df.count()

print("num_points: ", num_points)
print("sample_count:", sample_count)
print("sample %: ", (sample_count / num_points) * 100 )

#display(
#  sample_df
#    .groupBy(to_h3(F.col("latitude"), F.col("longitude"), F.lit(9)).alias("h3"))
#    .count()
#    .orderBy(F.desc("count"))
#)
from pyspark.sql.functions import size
df.select(size(split(col("Description"), " "))).show(2) # shows 5 and 3


# COMMAND ----------

from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)


# COMMAND ----------

from pyspark.sql.functions import split, explode

df.withColumn("splitted", split(col("Description"), " "))\
  .withColumn("exploded", explode(col("splitted")))\
  .select("Description", "InvoiceNo", "exploded").show(2)


# COMMAND ----------

from pyspark.sql.functions import create_map
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .show(2)


# COMMAND ----------

df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("complex_map['WHITE METAL LANTERN']").show(2)
Example #56
0
    spark = SparkSession\
        .builder\
        .appName("StructuredKafkaWordCount")\
        .getOrCreate()

    # Create DataSet representing the stream of input lines from kafka
    lines = spark\
        .readStream\
        .format("kafka")\
        .option("kafka.bootstrap.servers", bootstrapServers)\
        .option(subscribeType, topics)\
        .load()\
        .selectExpr("CAST(value AS STRING)")

    # Split the lines into words
    words = lines.select(
        # explode turns each item in an array into a separate row
        explode(split(lines.value, ' ')).alias('word'))

    # Generate running word count
    wordCounts = words.groupBy('word').count()

    # Start running the query that prints the running counts to the console
    query = wordCounts\
        .writeStream\
        .outputMode('complete')\
        .format('console')\
        .start()

    query.awaitTermination()
        .getOrCreate()

    userSchema = StructType().add("value", "string")

    # Create DataFrame representing the stream of input lines from connection to host:port
    lines = spark\
        .readStream\
        .format('csv')\
        .schema(userSchema)\
        .load('history')

    # Split the lines into words
    words = lines.select(
        # explode turns each item in an array into a separate row
        explode(
            split(lines.value, ' ')
        ).alias('word')
    )

    # Generate running word count
    wordCounts = words.groupBy('word').count()

    # Start running the query that prints the running counts to the console
    query = wordCounts\
        .writeStream\
        .outputMode('complete')\
        .format('memory')\
        .queryName('table')\
        .start()
    
    # TODO
Example #58
0
def create_subsample(spark, path):
    merge_nambiguous, em_nambiguous, merge_ambiguous, em_ambiguous = filter_df(
        spark, path
    )

    w_pattern = Window.partitionBy("pattern")
    w_entity = Window.partitionBy("nb_entities")
    w_domain = Window.partitionBy("domain")
    w_pattern_entity = Window.partitionBy("pattern", "nb_entities")

    # UNAMBIGUOUS DATA
    merge_w = merge_nambiguous.select(
        "*",
        F.count("*").over(w_pattern).alias("pattern_count"),
        F.count("*").over(w_domain).alias("domain_count"),
    ).select(
        "articleOffset",
        "articleUID",
        "full_text",
        "masked_text",
        "quotation",
        "entities",
        "speaker",
        "targets",
        "rand",
        F.when(F.col("domain_count") >= 100, F.col("domain"))
        .otherwise("others")
        .alias("domain"),
        F.when(F.col("nb_entities") <= 20, F.col("nb_entities"))
        .otherwise(21)
        .alias("nb_entities"),
        F.when(F.col("pattern_count") >= 500, F.col("pattern"))
        .otherwise("others")
        .alias("pattern"),
    )

    @F.udf(returnType=FloatType())
    def get_proba(nb_samples, max_samples=400):
        return min(1.0, max_samples / nb_samples)

    subsample = (
        merge_w.select("*", F.count("*").over(w_pattern_entity).alias("pe_count"))
        .withColumn("proba", get_proba("pe_count"))
        .filter("rand <= proba")
        .drop("rand", "pe_count", "proba")
    )

    subsample_pos, subsample_neg = subsample.randomSplit([0.8, 0.2], seed=SEED)

    subsample_pos.coalesce(32).write.parquet(
        join(path, "sampling/quootstrap_subsample_lower"),
        "overwrite",
        compression="gzip",
    )

    subsample_neg.rdd.map(create_neg_example).filter(
        lambda x: x is not None
    ).toDF().write.parquet(
        join(path, "sampling/quootstrap_subsample_neg_lower"),
        "overwrite",
        compression="gzip",
    )

    neg_examples = (
        em_nambiguous.select("*", F.explode("targets").alias("target"))
        .filter(F.col("target") == 0)
        .drop("target")
    )
    neg_examples.write.parquet(
        join(path, "sampling/neg_examples_lower"),
        "overwrite",
        compression="gzip",
    )

    em_nambiguous_target = em_nambiguous.join(
        neg_examples, on=["articleUID", "articleOffset"], how="leftanti"
    )
    em_w = em_nambiguous_target.select(
        "*", F.count("*").over(w_entity).alias("entities_count")
    ).select(
        "articleOffset",
        "articleUID",
        "full_text",
        "masked_text",
        "quotation",
        "entities",
        "speaker",
        "targets",
        "rand",
        "entities_count",
        F.when(F.col("nb_entities") <= 20, F.col("nb_entities"))
        .otherwise(21)
        .alias("nb_entities"),
    )

    @F.udf(returnType=FloatType())
    def get_proba_bis(nb_samples, max_samples=220_000):
        return min(1.0, max_samples / nb_samples)

    em_subsample = (
        em_w.withColumn("proba", get_proba_bis("entities_count"))
        .filter("rand <= proba")
        .drop("rand", "entities_count", "proba")
    )

    em_subsample_pos, em_subsample_neg = em_subsample.randomSplit([0.8, 0.2], seed=SEED)

    em_subsample_pos.write.parquet(
        join(path, "sampling/em_subsample_lower"),
        "overwrite",
        compression="gzip",
    )

    em_subsample_neg.rdd.map(create_neg_example).filter(
        lambda x: x is not None
    ).toDF().write.parquet(
        join(path, "sampling/em_subsample_neg_lower"),
        "overwrite",
        compression="gzip",
    )
Example #59
0
# MAGIC   + The second issue is that we need to filter out empty lines or words.
# MAGIC 
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row.  To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC 
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC 
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (shakespeareDF
                .select(
                  explode(
                    split(
                      shakespeareDF.sentence, '\s')
                    ).alias('word')
                  )
               ).where("word != ''")

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------
Example #60
0
def funnel_statistic_report_(cms_df):
    """Parse appstore_onclick/game_onclick (detail page), download_onclick (download click), download_fin (download finished) and install_fin (install finished) log data."""
    ## Path to the JSON log files
    json_path = "hdfs://master:9000/data/{0}/*/*.gz".format(str_dt_0)
    ##判断路径文件条件
    cmd = "hadoop fs -ls -R /data/{0} | egrep '.gz$' | wc -l".format(str_dt_0)
    if_zero = subprocess.check_output(cmd, shell=True).strip().split('\n')[0]
    ##判断日志文件路径是否存在
    if int(if_zero) == 0:
        print("the logs do not exist!")
        raise SystemExit(123)
    else:
        # Parse the JSON log data at json_path
        ## Join condition with cms_df
        condition_0_1 = (F.coalesce(F.col("t_0.package_id"),
                                    F.lit("123")) == F.coalesce(
                                        F.col("t_1.fsk_pid"), F.lit("123")))
        ## Left join with cms_df to get the app category
        df_download_onclick = spark.read.json(json_path).select(
            'custom_uuid', 'rectime',
            F.when(F.col('site') == 'ALI',
                   'youku').when(F.col('site') == 'IQIYI', 'iqiyi').when(
                       F.col('site') == 'BESTV',
                       'bestv').otherwise('others').alias('site'),
            F.explode('data.download_onclick').alias('download_onclick')
        ).filter(F.col("download_onclick.time").isNotNull()).select([
            'custom_uuid',
            F.regexp_replace(F.lit(str_dt_0), "-",
                             "").cast("int").alias("date"),
            F.lit('download_onclick').alias('featureName'),
            F.col("site").alias("site"),
            F.col("download_onclick.package_id").alias("package_id")
        ]).alias("t_0").join(cms_df.alias("t_1"), condition_0_1,
                             "left_outer").select(
                                 F.col("t_0.custom_uuid").alias("custom_uuid"),
                                 F.col("t_0.date").alias("date"),
                                 F.col("t_0.featureName").alias("featureName"),
                                 F.col("t_0.site").alias("site"),
                                 F.col("t_0.package_id").alias("package_id"),
                                 F.col("t_1.fsk_title").alias("title"),
                                 F.col("t_1.fsk_catalog").alias("fsk_catalog"))
        sql_download_fin = """ select custom_uuid,cast(regexp_replace(date,'-','') as int) as date,site,package_id from sharp.download_fin where dt='{date_0}'  """.format(
            date_0=str_dt_0)
        sql_install_fin = """ select custom_uuid,cast(regexp_replace(date,'-','') as int) as date,site,package_id from sharp.install_fin where dt='{date_0}' """.format(
            date_0=str_dt_0)
        spark.sql("show databases")
        spark.sql("use sharp")
        df_download_fin = spark.sql(sql_download_fin).alias("t_0").join(
            cms_df.alias("t_1"), condition_0_1, "left_outer").select(
                F.col("t_0.custom_uuid").alias("custom_uuid"),
                F.col("t_0.date").alias("date"),
                F.lit('download_fin').alias('featureName'),
                F.col("t_0.site").alias("site"),
                F.col("t_0.package_id").alias("package_id"),
                F.col("t_1.fsk_title").alias("title"),
                F.col("t_1.fsk_catalog").alias("fsk_catalog"))
        df_install_fin = spark.sql(sql_install_fin).alias("t_0").join(
            cms_df.alias("t_1"), condition_0_1, "left_outer").select(
                F.col("t_0.custom_uuid").alias("custom_uuid"),
                F.col("t_0.date").alias("date"),
                F.lit('install_fin').alias('featureName'),
                F.col("t_0.site").alias("site"),
                F.col("t_0.package_id").alias("package_id"),
                F.col("t_1.fsk_title").alias("title"),
                F.col("t_1.fsk_catalog").alias("fsk_catalog"))
        ## Aggregation
        df = df_download_onclick.unionAll(df_download_fin).unionAll(
            df_install_fin)
        df.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
        df.createOrReplaceTempView("v_df")
        sql_df_1 = """select date,featureName,site channelName,fsk_catalog typeName,grouping_id() id_1,count(custom_uuid) totalPlayCount from v_df group by date,featureName,site,fsk_catalog grouping sets((date,featureName,site,fsk_catalog),(date,featureName,site),(date,featureName,fsk_catalog),(date,featureName)) """
        sql_df_2 = """ select date,package_id appId,title appName,featureName,count(custom_uuid) totalPlayCount  from v_df group by date,package_id,title,featureName """
        spark.sql("show databases")
        spark.sql("use sharp")
        funnel_report_1 = spark.sql(sql_df_1)
        funnel_report_2 = spark.sql(sql_df_2)
        df.unpersist()
        ## Final reports
        return funnel_report_1, funnel_report_2