def main():
    """Reddit batch layer: load monthly comment JSON from S3, enrich each
    comment with a sampled (title, url) pair, persist rows to Cassandra,
    and build a user-relationship graph from co-commenting on the same post.
    """
    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        """Append a (title, url) pair sampled from the broadcast post pool.

        ~150,000 comments / 3,000 posts = avg 50 comments per topic.
        """
        # Fixed off-by-one: randint is inclusive at BOTH ends, so the old
        # randint(0, 3000) could return 3000 and overrun the pool.
        onePst = bcURL.value[randint(0, len(bcURL.value) - 1)]
        return cmtTuple + (onePst[0], onePst[1])  # adding title and url

    def _process_month(logFile, year, month):
        """Insert one month of comments into Cassandra and write its
        co-commenter graph.  Helper shared by the small-batch and the
        full-bucket code paths (previously duplicated inline).
        """
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        # Tuple layout after addTitleURL:
        #   0 author, 1 year_month, 2 created_utc, 3 subreddit, 4 id,
        #   5 body, 6 score, 7 ups, 8 controversiality, 9 title, 10 url
        users_row = users_rdd.map(lambda json: (json.author,
                                                '{0}_{1}'.format(year, month),
                                                json.created_utc,
                                                json.subreddit,
                                                json.id,
                                                json.body,
                                                json.score,
                                                json.ups,
                                                json.controversiality))\
                             .map(addTitleURL)
        users_row.foreachPartition(insert_into_cassandra)

        # (url, author) pairs; the self-join pairs up users who commented
        # on the same post.
        post2user = users_row.map(lambda x: (x[10], x[0]))
        graph     = post2user.join(post2user)\
                             .filter(lambda x: x[1][0] != x[1][1])\
                             .map(makeAscOrder)\
                             .distinct()\
                             .map(lambda x: (x[1], 1))\
                             .reduceByKey(lambda x, y: x + y)\
                             .map(lambda x: (x[0][0], x[1], x[0][1]))
        # join      : self join to find user relationship by posts
        # filter    : drop self-linked relationships
        # makeAscOrder + distinct: relationship is mutual, keep one ordered pair
        # map/reduce: count common posts per user pair
        # final map : flatten to (userA, count, userB) rows for the table
        graph.foreachPartition(insert_graph)

    if smallBatch:
        # Single known month for a quick end-to-end run.
        # Fixed: the original labelled this October (RC_2007-10) file with
        # month 12, mislabeling every row's year_month column.
        _process_month('s3a://reddit-comments/2007/RC_2007-10', 2007, 10)
    else:
        for key in bucket.list():
            name = key.name.encode('utf-8')
            if '-' not in name:  # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET, name)
            year = logFile.split('-')[1][-4:]
            month = logFile.split('-')[2]
            from_year, from_month = FROM_YEAR_MONTH.split('_')
            # Skip anything before the configured FROM_YEAR_MONTH cutoff
            # (tuple comparison == year<, or year== and month<).
            if (int(year), int(month)) < (int(from_year), int(from_month)):
                continue
            _process_month(logFile, year, month)

    sc.stop()
# Example #2
# 0
def main():
    """Count distinct (non-'[deleted]') comment authors per monthly Reddit
    JSON file from S3, after enriching each comment with a sampled
    (title, url) pair, and pretty-print the counts.
    """
    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        """Append a (title, url) pair sampled from the broadcast post pool.

        ~150,000 comments / 3,000 posts = avg 50 comments per topic.
        """
        # Fixed off-by-one: randint is inclusive at BOTH ends, so the old
        # randint(0, 3000) could return 3000 and overrun the pool.
        onePst = bcURL.value[randint(0, len(bcURL.value) - 1)]
        return cmtTuple + (onePst[0], onePst[1])  # adding title and url

    def _count_users(logFile, year, month):
        """Report the distinct-author count for one monthly JSON file.
        Helper shared by the small-batch and full-bucket paths (previously
        duplicated inline).
        """
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        # Tuple layout after addTitleURL:
        #   0 author, 1 year_month, 2 created_utc, 3 subreddit, 4 id,
        #   5 body, 6 score, 7 ups, 8 controversiality, 9 title, 10 url
        users_row = users_rdd.map(lambda json: (json.author,
                                                '{0}_{1}'.format(year, month),
                                                json.created_utc,
                                                json.subreddit,
                                                json.id,
                                                json.body,
                                                json.score,
                                                json.ups,
                                                json.controversiality))\
                             .map(addTitleURL)
        nUsers = users_row.map(lambda x: x[0]).distinct().count()
        pp.pprint("distinct user number:" + str(nUsers) + "\n")

    if smallBatch:
        # Single known month for a quick run.
        # Fixed: the original labelled this October (RC_2007-10) file with
        # month 12, mislabeling the year_month column.
        _count_users('s3a://reddit-comments/2007/RC_2007-10', 2007, 10)
    else:
        for key in bucket.list():
            name = key.name.encode('utf-8')
            if '-' not in name:  # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET, name)
            year = logFile.split('-')[1][-4:]
            month = logFile.split('-')[2]
            from_year, from_month = FROM_YEAR_MONTH.split('_')
            # Skip anything before the configured FROM_YEAR_MONTH cutoff
            # (tuple comparison == year<, or year== and month<).
            if (int(year), int(month)) < (int(from_year), int(from_month)):
                continue
            _count_users(logFile, year, month)

    sc.stop()
# Example #3
# 0
from pyspark import SparkContext, SparkConf, SQLContext

# Spark bootstrap; the app name identifies this job in the Spark UI.
conf = SparkConf().setAppName("pyspark-readFromJSONinHDFS-py")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Load the department JSON file from HDFS into a DataFrame.
# NOTE(review): jsonFile() is the pre-Spark-1.4 API; sqlContext.read.json()
# is the modern equivalent -- confirm which Spark version this targets.
departmentsJson = sqlContext.jsonFile(
    "/user/joseluisillana1709/department_json/department.json")
# Expose the DataFrame to SQL, select everything back, and print each row
# on the driver (collect() pulls the full result set -- fine for a small
# departments table).
departmentsJson.registerTempTable("departmentsTable")
departmentsData = sqlContext.sql("select * from departmentsTable")
for rec in departmentsData.collect():
    print(rec)

#Writing data in json format
departmentsData.toJSON().saveAsTextFile(
    "/user/joseluisillana1709/pruebas_spark/result/departmentsJson")
def main():
    """Reddit batch layer: load monthly comment JSON from S3, enrich each
    comment with a sampled (title, url) pair, persist rows to Cassandra,
    and build a user-relationship graph from co-commenting on the same post.
    """
    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        """Append a (title, url) pair sampled from the broadcast post pool.

        ~150,000 comments / 3,000 posts = avg 50 comments per topic.
        """
        # Fixed off-by-one: randint is inclusive at BOTH ends, so the old
        # randint(0, 3000) could return 3000 and overrun the pool.
        onePst = bcURL.value[randint(0, len(bcURL.value) - 1)]
        return cmtTuple + (onePst[0], onePst[1])  # adding title and url

    def _process_month(logFile, year, month):
        """Insert one month of comments into Cassandra and write its
        co-commenter graph.  Helper shared by the small-batch and the
        full-bucket code paths (previously duplicated inline).
        """
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        # Tuple layout after addTitleURL:
        #   0 author, 1 year_month, 2 created_utc, 3 subreddit, 4 id,
        #   5 body, 6 score, 7 ups, 8 controversiality, 9 title, 10 url
        users_row = users_rdd.map(lambda json: (json.author,
                                                '{0}_{1}'.format(year, month),
                                                json.created_utc,
                                                json.subreddit,
                                                json.id,
                                                json.body,
                                                json.score,
                                                json.ups,
                                                json.controversiality))\
                             .map(addTitleURL)
        users_row.foreachPartition(insert_into_cassandra)

        # (url, author) pairs; the self-join pairs up users who commented
        # on the same post.
        post2user = users_row.map(lambda x: (x[10], x[0]))
        graph     = post2user.join(post2user)\
                             .filter(lambda x: x[1][0] != x[1][1])\
                             .map(makeAscOrder)\
                             .distinct()\
                             .map(lambda x: (x[1], 1))\
                             .reduceByKey(lambda x, y: x + y)\
                             .map(lambda x: (x[0][0], x[1], x[0][1]))
        # join      : self join to find user relationship by posts
        # filter    : drop self-linked relationships
        # makeAscOrder + distinct: relationship is mutual, keep one ordered pair
        # map/reduce: count common posts per user pair
        # final map : flatten to (userA, count, userB) rows for the table
        graph.foreachPartition(insert_graph)

    if smallBatch:
        # Single known month for a quick end-to-end run.
        # Fixed: the original labelled this October (RC_2007-10) file with
        # month 12, mislabeling every row's year_month column.
        _process_month('s3a://reddit-comments/2007/RC_2007-10', 2007, 10)
    else:
        for key in bucket.list():
            name = key.name.encode('utf-8')
            if '-' not in name:  # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET, name)
            year = logFile.split('-')[1][-4:]
            month = logFile.split('-')[2]
            from_year, from_month = FROM_YEAR_MONTH.split('_')
            # Skip anything before the configured FROM_YEAR_MONTH cutoff
            # (tuple comparison == year<, or year== and month<).
            if (int(year), int(month)) < (int(from_year), int(from_month)):
                continue
            _process_month(logFile, year, month)

    sc.stop()
def main():
    """Count distinct (non-'[deleted]') comment authors per monthly Reddit
    JSON file from S3, after enriching each comment with a sampled
    (title, url) pair, and pretty-print the counts.
    """
    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        """Append a (title, url) pair sampled from the broadcast post pool.

        ~150,000 comments / 3,000 posts = avg 50 comments per topic.
        """
        # Fixed off-by-one: randint is inclusive at BOTH ends, so the old
        # randint(0, 3000) could return 3000 and overrun the pool.
        onePst = bcURL.value[randint(0, len(bcURL.value) - 1)]
        return cmtTuple + (onePst[0], onePst[1])  # adding title and url

    def _count_users(logFile, year, month):
        """Report the distinct-author count for one monthly JSON file.
        Helper shared by the small-batch and full-bucket paths (previously
        duplicated inline).
        """
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        # Tuple layout after addTitleURL:
        #   0 author, 1 year_month, 2 created_utc, 3 subreddit, 4 id,
        #   5 body, 6 score, 7 ups, 8 controversiality, 9 title, 10 url
        users_row = users_rdd.map(lambda json: (json.author,
                                                '{0}_{1}'.format(year, month),
                                                json.created_utc,
                                                json.subreddit,
                                                json.id,
                                                json.body,
                                                json.score,
                                                json.ups,
                                                json.controversiality))\
                             .map(addTitleURL)
        nUsers = users_row.map(lambda x: x[0]).distinct().count()
        pp.pprint("distinct user number:" + str(nUsers) + "\n")

    if smallBatch:
        # Single known month for a quick run.
        # Fixed: the original labelled this October (RC_2007-10) file with
        # month 12, mislabeling the year_month column.
        _count_users('s3a://reddit-comments/2007/RC_2007-10', 2007, 10)
    else:
        for key in bucket.list():
            name = key.name.encode('utf-8')
            if '-' not in name:  # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET, name)
            year = logFile.split('-')[1][-4:]
            month = logFile.split('-')[2]
            from_year, from_month = FROM_YEAR_MONTH.split('_')
            # Skip anything before the configured FROM_YEAR_MONTH cutoff
            # (tuple comparison == year<, or year== and month<).
            if (int(year), int(month)) < (int(from_year), int(from_month)):
                continue
            _count_users(logFile, year, month)

    sc.stop()