def consume_records():
    """Stream CSV rating records and print rows for users with userId < 481.

    Reads CSV files from ``$SPARK_DATA/structured`` as a structured stream
    with a fixed (userId, movieId, rating, timestamp) schema, filters to
    ``userId < 481``, projects (userId, rating), and writes matching rows to
    the console in 'append' output mode. Blocks until the streaming query
    terminates.

    NOTE(review): another ``consume_records`` defined later in this file
    shadows this definition at import time — confirm which variant callers
    actually expect before relying on this one.
    """
    spark_context = SparkContext(appName='RatingConsumer')
    sql_context = SQLContext(spark_context)
    stream_reader = DataStreamReader(sql_context)
    fpath = os.path.join(os.environ['SPARK_DATA'], 'structured')
    # File-source streaming reads require an explicit schema up front.
    schema = StructType([
        StructField('userId', IntegerType(), True),
        StructField('movieId', IntegerType(), True),
        StructField('rating', FloatType(), True),
        StructField('timestamp', StringType(), True),
    ])
    ratings = stream_reader.load(fpath, schema=schema, format='csv')
    # Dead code removed: a 'ratingsView' temp view was registered but never
    # queried (the SQL that used it was commented out); the DataFrame API
    # below is the live path.
    user_481 = ratings.where("userId < 481").select("userId", "rating")
    query = (user_481
             .writeStream
             .outputMode('append')
             .format('console')
             .start())
    query.awaitTermination()
def consume_records():
    """Stream CSV rating records and print a running per-user record count.

    Loads CSV files from ``$SPARK_DATA/structured`` as a structured stream
    with a fixed (userId, movieId, rating, timestamp) schema, aggregates the
    number of records per ``userId``, and writes the full aggregate table to
    the console in 'complete' output mode. Blocks until the streaming query
    terminates.
    """
    sc = SparkContext(appName='RatingConsumer')
    ctx = SQLContext(sc)
    reader = DataStreamReader(ctx)
    data_dir = os.path.join(os.environ['SPARK_DATA'], 'structured')
    # Column name/type pairs drive the explicit schema required by
    # file-source streaming reads.
    columns = (
        ('userId', IntegerType()),
        ('movieId', IntegerType()),
        ('rating', FloatType()),
        ('timestamp', StringType()),
    )
    schema = StructType([StructField(name, dtype, True)
                         for name, dtype in columns])
    ratings = reader.load(data_dir, schema=schema, format='csv')
    user_counts = ratings.groupBy('userId').count()
    stream = (user_counts
              .writeStream
              .outputMode('complete')
              .format('console')
              .start())
    stream.awaitTermination()