from random import randint

from boto.s3.connection import S3Connection
from pyspark import SparkContext
from pyspark.sql import SQLContext

# SPARK_ADDRESS, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, RAW_JSON_REDDIT_BUCKET,
# FROM_YEAR_MONTH, REPARTITION_SIZE, smallBatch and urlTitlePool, as well as the
# helpers insert_into_cassandra, insert_graph and makeAscOrder, are defined
# elsewhere in the module.


def main():
    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    #sc = SparkContext("local[*]", appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    #conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        # 150,000 comments / 3,000 posts = avg 50 comments per topic
        onePst = bcURL.value[randint(0, len(bcURL.value) - 1)]
        return cmtTuple + (onePst[0], onePst[1])  # append title and url

    if smallBatch:
        logFile = 's3a://reddit-comments/2007/RC_2007-10'
        #df = sqlContext.read.json(logFile)
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        year = 2007
        month = 10
        users_row = users_rdd.map(lambda json: (json.author,
                                                '{0}_{1}'.format(year, month),
                                                json.created_utc,
                                                json.subreddit,
                                                json.id,
                                                json.body,
                                                json.score,
                                                json.ups,
                                                json.controversiality))\
                             .map(addTitleURL)
                             #.repartition(REPARTITION_SIZE)
        users_row.foreachPartition(insert_into_cassandra)

        # calculate the user relationship graph from (URL, user) tuples
        post2user = users_row.map(lambda x: (x[10], x[0]))
        graph = (post2user.join(post2user)                   # self-join: users who commented on the same post
                 .filter(lambda x: x[1][0] != x[1][1])       # drop self-linked pairs
                 .map(makeAscOrder)                          # order each pair ascending by user name
                 .distinct()                                 # relationship is mutual: keep one copy of each pair per post
                 .map(lambda x: (x[1], 1))                   # prepare to count common edges
                 .reduceByKey(lambda x, y: x + y)            # total count for every edge/relationship
                 .map(lambda x: (x[0][0], x[1], x[0][1])))   # flatten to (user_a, weight, user_b) for the table write
        graph.foreachPartition(insert_graph)
    else:
        for key in bucket.list():
            if '-' not in key.name.encode('utf-8'):  # skip folders and _SUCCESS markers
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET,
                                             key.name.encode('utf-8'))
            year = logFile.split('-')[1][-4:]
            month = logFile.split('-')[2]
            from_year = FROM_YEAR_MONTH.split('_')[0]
            from_month = FROM_YEAR_MONTH.split('_')[1]
            if int(year) < int(from_year) or (int(year) == int(from_year)
                                              and int(month) < int(from_month)):
                continue

            #df = sqlContext.read.json(logFile)
            df = sqlContext.jsonFile(logFile)
            users_rdd = df.filter(df['author'] != '[deleted]')
            # tuple layout: 0 author, 1 year_month, 2 created_utc, 3 subreddit,
            # 4 id, 5 body, 6 score, 7 ups, 8 controversiality, 9 title, 10 url
            # (9 and 10 are appended by addTitleURL)
            users_row = users_rdd.map(lambda json: (json.author,
                                                    '{0}_{1}'.format(year, month),
                                                    json.created_utc,
                                                    json.subreddit,
                                                    json.id,
                                                    json.body,
                                                    json.score,
                                                    json.ups,
                                                    json.controversiality))\
                                 .map(addTitleURL)
                                 #.repartition(REPARTITION_SIZE)
            users_row.foreachPartition(insert_into_cassandra)

            # calculate the user relationship graph from (URL, user) tuples
            post2user = users_row.map(lambda x: (x[10], x[0]))
            graph = (post2user.join(post2user)                   # self-join: users who commented on the same post
                     .filter(lambda x: x[1][0] != x[1][1])       # drop self-linked pairs
                     .map(makeAscOrder)                          # order each pair ascending by user name
                     .distinct()                                 # relationship is mutual: keep one copy of each pair per post
                     .map(lambda x: (x[1], 1))                   # prepare to count common edges
                     .reduceByKey(lambda x, y: x + y)            # total count for every edge/relationship
                     .map(lambda x: (x[0][0], x[1], x[0][1]))    # flatten to (user_a, weight, user_b) for the table write
                     #.repartition(REPARTITION_SIZE)
                     )
            graph.foreachPartition(insert_graph)

    sc.stop()
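The pipeline above relies on three helpers that are not part of this excerpt: makeAscOrder, insert_into_cassandra and insert_graph. The two insert functions are foreachPartition callbacks, so each receives an iterator over the rows of a single partition and writes them to Cassandra. Their implementations are not shown here; the sketch below only illustrates what makeAscOrder has to do for the distinct() step to deduplicate mutual edges, assuming the joined records have the shape (url, (user_a, user_b)) produced by the self-join.

def makeAscOrder(record):
    # Hypothetical sketch: normalize a joined record so the two user names
    # appear in ascending order, letting (a, b) and (b, a) collapse to a
    # single edge in the following distinct() call.
    url, (user_a, user_b) = record
    if user_a > user_b:
        user_a, user_b = user_b, user_a
    return (url, (user_a, user_b))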
# Same imports and module-level constants as in the previous variant.


def main():
    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    #sc = SparkContext("local[*]", appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    #conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        # 150,000 comments / 3,000 posts = avg 50 comments per topic
        onePst = bcURL.value[randint(0, len(bcURL.value) - 1)]
        return cmtTuple + (onePst[0], onePst[1])  # append title and url

    if smallBatch:
        logFile = 's3a://reddit-comments/2007/RC_2007-10'
        #logFile = 's3a://reddit-comments/2012/RC_2012-12'
        #df = sqlContext.read.json(logFile)
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        year = 2007
        month = 10
        users_row = users_rdd.map(lambda json: (json.author,
                                                '{0}_{1}'.format(year, month),
                                                json.created_utc,
                                                json.subreddit,
                                                json.id,
                                                json.body,
                                                json.score,
                                                json.ups,
                                                json.controversiality))\
                             .map(addTitleURL)
                             #.repartition(REPARTITION_SIZE)

        # count the distinct commenting users in this batch
        post2user = users_row.map(lambda x: x[0])  # keep only the author field
        nUsers = post2user.distinct().count()
        pp.pprint("distinct user number: " + str(nUsers) + "\n")
    else:
        for key in bucket.list():
            if '-' not in key.name.encode('utf-8'):  # skip folders and _SUCCESS markers
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET,
                                             key.name.encode('utf-8'))
            year = logFile.split('-')[1][-4:]
            month = logFile.split('-')[2]
            from_year = FROM_YEAR_MONTH.split('_')[0]
            from_month = FROM_YEAR_MONTH.split('_')[1]
            if int(year) < int(from_year) or (int(year) == int(from_year)
                                              and int(month) < int(from_month)):
                continue

            #df = sqlContext.read.json(logFile)
            df = sqlContext.jsonFile(logFile)
            users_rdd = df.filter(df['author'] != '[deleted]')
            # tuple layout: 0 author, 1 year_month, 2 created_utc, 3 subreddit,
            # 4 id, 5 body, 6 score, 7 ups, 8 controversiality, 9 title, 10 url
            # (9 and 10 are appended by addTitleURL)
            users_row = users_rdd.map(lambda json: (json.author,
                                                    '{0}_{1}'.format(year, month),
                                                    json.created_utc,
                                                    json.subreddit,
                                                    json.id,
                                                    json.body,
                                                    json.score,
                                                    json.ups,
                                                    json.controversiality))\
                                 .map(addTitleURL)
                                 #.repartition(REPARTITION_SIZE)

            # count the distinct commenting users in this batch
            post2user = users_row.map(lambda x: x[0])  # keep only the author field
            nUsers = post2user.distinct().count()
            pp.pprint("distinct user number: " + str(nUsers) + "\n")

    sc.stop()
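Both variants read the comment dumps with sqlContext.jsonFile(...), the Spark 1.x API that was deprecated in 1.4 and removed in 2.0; the commented-out sqlContext.read.json(logFile) lines already hint at the replacement. A minimal sketch of the equivalent read on a newer Spark, assuming the same S3 path and that S3 credentials are configured on the cluster:

from pyspark.sql import SparkSession

# DataFrameReader-based equivalent of sqlContext.jsonFile(logFile)
spark = SparkSession.builder.appName("RedditBatchLayer").getOrCreate()
df = spark.read.json('s3a://reddit-comments/2007/RC_2007-10')
users = df.filter(df['author'] != '[deleted]')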
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

conf = SparkConf().setAppName("pyspark-readFromJSONinHDFS-py")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

departmentsJson = sqlContext.jsonFile(
    "/user/joseluisillana1709/department_json/department.json")
departmentsJson.registerTempTable("departmentsTable")
departmentsData = sqlContext.sql("select * from departmentsTable")
for rec in departmentsData.collect():
    print(rec)

# Writing the data back out in JSON format
departmentsData.toJSON().saveAsTextFile(
    "/user/joseluisillana1709/pruebas_spark/result/departmentsJson")
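On Spark 1.4+ the final write can go through the DataFrameWriter instead of round-tripping through toJSON() and saveAsTextFile; a minimal sketch, reusing the same output path as above:

# DataFrameWriter equivalent of departmentsData.toJSON().saveAsTextFile(...)
departmentsData.write.json(
    "/user/joseluisillana1709/pruebas_spark/result/departmentsJson")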