# Assumed imports for these examples (the snippets omit them); module-level config such as
# SPARK_ADDRESS, the AWS credentials, bucket names, urlTitlePool, smallBatch, FROM_YEAR_MONTH,
# pp, and the helpers insert_into_cassandra, insert_graph and makeAscOrder are defined
# elsewhere in the source project.
from random import randint

from boto.s3.connection import S3Connection
from pyspark import SparkContext
from pyspark.sql import SQLContext


def main():

    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    #sc = SparkContext("local[*]", appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    #conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        # ~150,000 comments / 3,000 posts = avg 50 comments/topic
        # randint's upper bound is inclusive, so index with len - 1 to avoid an IndexError
        onePst = bcURL.value[randint(0, len(bcURL.value) - 1)]
        return cmtTuple + (onePst[0], onePst[1])  # append title and url


    if (smallBatch): 
        logFile = 's3a://reddit-comments/2007/RC_2007-10'
        #df = sqlContext.read.json(logFile)
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]') 
        year = 2007
        month = 12
        users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                             .map(addTitleURL)
                             #.repartition(REPARTITION_SIZE)
        users_row.foreachPartition(insert_into_cassandra)

        # calculate user relationship graph
        # (URL, user) tuple
        post2user = users_row.map(lambda x: (x[10], x[0]))
        # Self-join on URL to find every pair of users who commented on the same post,
        # then count how many posts each pair shares.
        graph = (post2user.join(post2user)                        # (url, (user1, user2))
                          .filter(lambda x: x[1][0] != x[1][1])   # drop self-linked pairs
                          .map(makeAscOrder)                      # order each pair ascending by user name
                          .distinct()                             # the relationship is mutual, so drop duplicate pairs
                          .map(lambda x: (x[1], 1))               # ready to count shared posts per pair
                          .reduceByKey(lambda x, y: x + y)        # total count for every edge/relationship
                          .map(lambda x: (x[0][0], x[1], x[0][1])))  # flatten to (user1, count, user2) rows
        graph.foreachPartition(insert_graph)

    else:

        for key in bucket.list():
            if '-' not in key.name.encode('utf-8'): # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET, key.name.encode('utf-8'))
            year = logFile.split('-')[1][-4:] 
            month = logFile.split('-')[2]
            from_year = FROM_YEAR_MONTH.split('_')[0]
            from_month = FROM_YEAR_MONTH.split('_')[1]
            if int(year) < int(from_year) or (int(year) == int(from_year) and int(month) < int(from_month)):
                continue
            #df = sqlContext.read.json(logFile)
            df = sqlContext.jsonFile(logFile)
            users_rdd = df.filter(df['author'] != '[deleted]') 
                                                   #   0                     1                        2                3            4          5          6          7              8           9 (title)   10(url)
            users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                                 .map(addTitleURL)
                                 #.repartition(REPARTITION_SIZE)
            users_row.foreachPartition(insert_into_cassandra)

            # calculate user relationship graph
            # (URL, user) tuple
            post2user = users_row.map(lambda x: (x[10], x[0]))
            # Self-join on URL to find every pair of users who commented on the same post,
            # then count how many posts each pair shares.
            graph = (post2user.join(post2user)                        # (url, (user1, user2))
                              .filter(lambda x: x[1][0] != x[1][1])   # drop self-linked pairs
                              .map(makeAscOrder)                      # order each pair ascending by user name
                              .distinct()                             # the relationship is mutual, so drop duplicate pairs
                              .map(lambda x: (x[1], 1))               # ready to count shared posts per pair
                              .reduceByKey(lambda x, y: x + y)        # total count for every edge/relationship
                              .map(lambda x: (x[0][0], x[1], x[0][1])))  # flatten to (user1, count, user2) rows
                                 #.repartition(REPARTITION_SIZE)
            graph.foreachPartition(insert_graph)

    sc.stop()
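The graph pipeline above depends on a makeAscOrder helper that the snippet does not show. Going by the inline comments, it presumably normalizes each (url, (user1, user2)) record so the two user names are in ascending order, letting distinct() collapse mutual pairs. A minimal sketch of such a helper, purely an assumption about the original:

# Hypothetical helper (not part of the original snippet): put the user pair in ascending
# order so (url, (A, B)) and (url, (B, A)) become identical and can be deduplicated.
def makeAscOrder(record):
    url, (user1, user2) = record
    if user1 <= user2:
        return (url, (user1, user2))
    return (url, (user2, user1))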
Example #2
def main():

    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    #sc = SparkContext("local[*]", appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    #conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        # ~150,000 comments / 3,000 posts = avg 50 comments/topic
        # randint's upper bound is inclusive, so index with len - 1 to avoid an IndexError
        onePst = bcURL.value[randint(0, len(bcURL.value) - 1)]
        return cmtTuple + (onePst[0], onePst[1])  # append title and url

    if (smallBatch):
        logFile = 's3a://reddit-comments/2007/RC_2007-10'
        #logFile = 's3a://reddit-comments/2012/RC_2012-12'
        #df = sqlContext.read.json(logFile)
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        year = 2007
        month = 12
        users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                             .map(addTitleURL)
        #.repartition(REPARTITION_SIZE)
        # This variant only counts distinct users: keep just the author (index 0) of each row
        post2user = users_row.map(lambda x: x[0])
        nUsers = post2user.distinct().count()
        pp.pprint("distinct user number:" + str(nUsers) + "\n")

    else:

        for key in bucket.list():
            if '-' not in key.name.encode(
                    'utf-8'):  # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET,
                                             key.name.encode('utf-8'))
            year = logFile.split('-')[1][-4:]
            month = logFile.split('-')[2]
            from_year = FROM_YEAR_MONTH.split('_')[0]
            from_month = FROM_YEAR_MONTH.split('_')[1]
            if int(year) < int(from_year) or (int(year) == int(from_year) and
                                              int(month) < int(from_month)):
                continue
            #df = sqlContext.read.json(logFile)
            df = sqlContext.jsonFile(logFile)
            users_rdd = df.filter(df['author'] != '[deleted]')
            #   0                     1                        2                3            4          5          6          7              8           9 (title)   10(url)
            users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                                 .map(addTitleURL)
            #.repartition(REPARTITION_SIZE)

            # This variant only counts distinct users: keep just the author (index 0) of each row
            post2user = users_row.map(lambda x: x[0])
            nUsers = post2user.distinct().count()
            pp.pprint("distinct user number:" + str(nUsers) + "\n")

    sc.stop()
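The same distinct-user count can also be expressed directly on the DataFrame, without building the intermediate tuple RDD. A minimal sketch, assuming the same df with an author column:

# Count distinct non-deleted authors straight from the DataFrame (Spark 1.x DataFrame API).
nUsers = df.filter(df['author'] != '[deleted]') \
           .select('author') \
           .distinct() \
           .count()
pp.pprint("distinct user number:" + str(nUsers) + "\n")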
Example #3
from pyspark import SparkContext, SparkConf, SQLContext

conf = SparkConf().setAppName("pyspark-readFromJSONinHDFS-py")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

departmentsJson = sqlContext.jsonFile(
    "/user/joseluisillana1709/department_json/department.json")
departmentsJson.registerTempTable("departmentsTable")
departmentsData = sqlContext.sql("select * from departmentsTable")
for rec in departmentsData.collect():
    print(rec)

#Writing data in json format
departmentsData.toJSON().saveAsTextFile(
    "/user/joseluisillana1709/pruebas_spark/result/departmentsJson")
def main():

    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    #sc = SparkContext("local[*]", appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    #conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        # ~150,000 comments / 3,000 posts = avg 50 comments/topic
        # randint's upper bound is inclusive, so index with len - 1 to avoid an IndexError
        onePst = bcURL.value[randint(0, len(bcURL.value) - 1)]
        return cmtTuple + (onePst[0], onePst[1])  # append title and url

    if (smallBatch):
        logFile = 's3a://reddit-comments/2007/RC_2007-10'
        #df = sqlContext.read.json(logFile)
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        year = 2007
        month = 12
        users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                             .map(addTitleURL)
        #.repartition(REPARTITION_SIZE)
        users_row.foreachPartition(insert_into_cassandra)

        # calculate user relationship graph
        # (URL, user) tuple
        post2user = users_row.map(lambda x: (x[10], x[0]))
        # Self-join on URL to find every pair of users who commented on the same post,
        # then count how many posts each pair shares.
        graph = (post2user.join(post2user)                        # (url, (user1, user2))
                          .filter(lambda x: x[1][0] != x[1][1])   # drop self-linked pairs
                          .map(makeAscOrder)                      # order each pair ascending by user name
                          .distinct()                             # the relationship is mutual, so drop duplicate pairs
                          .map(lambda x: (x[1], 1))               # ready to count shared posts per pair
                          .reduceByKey(lambda x, y: x + y)        # total count for every edge/relationship
                          .map(lambda x: (x[0][0], x[1], x[0][1])))  # flatten to (user1, count, user2) rows
        graph.foreachPartition(insert_graph)

    else:

        for key in bucket.list():
            if '-' not in key.name.encode(
                    'utf-8'):  # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET,
                                             key.name.encode('utf-8'))
            year = logFile.split('-')[1][-4:]
            month = logFile.split('-')[2]
            from_year = FROM_YEAR_MONTH.split('_')[0]
            from_month = FROM_YEAR_MONTH.split('_')[1]
            if int(year) < int(from_year) or (int(year) == int(from_year) and
                                              int(month) < int(from_month)):
                continue
            #df = sqlContext.read.json(logFile)
            df = sqlContext.jsonFile(logFile)
            users_rdd = df.filter(df['author'] != '[deleted]')
            #   0                     1                        2                3            4          5          6          7              8           9 (title)   10(url)
            users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                                 .map(addTitleURL)
            #.repartition(REPARTITION_SIZE)
            users_row.foreachPartition(insert_into_cassandra)

            # calculate user relationship graph
            # (URL, user) tuple
            post2user = users_row.map(lambda x: (x[10], x[0]))
            # Self-join on URL to find every pair of users who commented on the same post,
            # then count how many posts each pair shares.
            graph = (post2user.join(post2user)                        # (url, (user1, user2))
                              .filter(lambda x: x[1][0] != x[1][1])   # drop self-linked pairs
                              .map(makeAscOrder)                      # order each pair ascending by user name
                              .distinct()                             # the relationship is mutual, so drop duplicate pairs
                              .map(lambda x: (x[1], 1))               # ready to count shared posts per pair
                              .reduceByKey(lambda x, y: x + y)        # total count for every edge/relationship
                              .map(lambda x: (x[0][0], x[1], x[0][1])))  # flatten to (user1, count, user2) rows
            #.repartition(REPARTITION_SIZE)
            graph.foreachPartition(insert_graph)

    sc.stop()
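The insert_into_cassandra and insert_graph callbacks used with foreachPartition are not included in the snippet. A per-partition sink typically opens one database session per partition and issues an insert per row; a minimal sketch with the DataStax cassandra-driver, where the contact point, keyspace, table and column names are purely illustrative assumptions:

from cassandra.cluster import Cluster

# Hypothetical per-partition sink (schema and names are illustrative, not from the source).
def insert_graph(rows):
    cluster = Cluster(['127.0.0.1'])     # assumed Cassandra contact point
    session = cluster.connect('reddit')  # assumed keyspace
    stmt = session.prepare(
        "INSERT INTO user_graph (user1, weight, user2) VALUES (?, ?, ?)")
    for user1, weight, user2 in rows:    # matches the (user1, count, user2) tuples built above
        session.execute(stmt, (user1, weight, user2))
    cluster.shutdown()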
def main():

    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    #sc = SparkContext("local[*]", appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    #conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        # ~150,000 comments / 3,000 posts = avg 50 comments/topic
        # randint's upper bound is inclusive, so index with len - 1 to avoid an IndexError
        onePst = bcURL.value[randint(0, len(bcURL.value) - 1)]
        return cmtTuple + (onePst[0], onePst[1])  # append title and url


    if (smallBatch): 
        logFile = 's3a://reddit-comments/2007/RC_2007-10'
        #logFile = 's3a://reddit-comments/2012/RC_2012-12'
        #df = sqlContext.read.json(logFile)
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]') 
        year = 2007
        month = 12
        users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                             .map(addTitleURL)
                             #.repartition(REPARTITION_SIZE)
        # This variant only counts distinct users: keep just the author (index 0) of each row
        post2user = users_row.map(lambda x: x[0])
        nUsers = post2user.distinct().count()
        pp.pprint("distinct user number:" + str(nUsers) + "\n")

    else:

        for key in bucket.list():
            if '-' not in key.name.encode('utf-8'): # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET, key.name.encode('utf-8'))
            year = logFile.split('-')[1][-4:] 
            month = logFile.split('-')[2]
            from_year = FROM_YEAR_MONTH.split('_')[0]
            from_month = FROM_YEAR_MONTH.split('_')[1]
            if int(year) < int(from_year) or (int(year) == int(from_year) and int(month) < int(from_month)):
                continue
            #df = sqlContext.read.json(logFile)
            df = sqlContext.jsonFile(logFile)
            users_rdd = df.filter(df['author'] != '[deleted]') 
                                                   #   0                     1                        2                3            4          5          6          7              8           9 (title)   10(url)
            users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                                 .map(addTitleURL)
                                 #.repartition(REPARTITION_SIZE)

            # This variant only counts distinct users: keep just the author (index 0) of each row
            post2user = users_row.map(lambda x: x[0])
            nUsers = post2user.distinct().count()
            pp.pprint("distinct user number:" + str(nUsers) + "\n")
            

    sc.stop()
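All of these examples load JSON with sqlContext.jsonFile, which was deprecated in Spark 1.4 in favour of the DataFrameReader API already hinted at by the commented-out lines. A minimal equivalent:

# DataFrameReader equivalent of sqlContext.jsonFile (Spark 1.4+).
df = sqlContext.read.json(logFile)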