Python SQLContext.cacheTable 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: pyspark

클래스/타입: SQLContext

메소드/함수: cacheTable

hotexamples.com에서의 예제들: 4

Python SQLContext.cacheTable - 4개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 pyspark.SQLContext.cacheTable에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

sql(30)

createDataFrame(30)

SQLContext(28)

getOrCreate(17)

setConf(14)

registerDataFrameAsTable(10)

load(4)

cacheTable(4)

jsonFile(3)

show(3)

parquetFile(3)

registerFunction(3)

withColumn(2)

dropTempTable(2)

tableNames(2)

clearCache(2)

range(2)

applySchema(2)

jsonRDD(2)

inferSchema(2)

groupby(1)

printSchema(1)

select(1)

persist(1)

filter(1)

예제 #1

파일 보기

파일: engine.py 프로젝트: hafizurcse/movie-rec-tutorial

    .option('useSSL', False) \
    .option("url", jdbcUrl) \
    .option("dbtable", TABLE_RATINGS) \
    .option("driver", 'com.mysql.jdbc.Driver') \
    .load()

dfMovies = sqlContext.read.format('jdbc') \
    .option('useSSL', False) \
    .option("url", jdbcUrl) \
    .option("dbtable", TABLE_MOVIES) \
    .option("driver", 'com.mysql.jdbc.Driver') \
    .load()

dfRates.registerTempTable('Rates')

sqlContext.cacheTable('Rates')

logger.info("Datasets Loaded...")

rank = 8
seed = 5L
iterations = 10
regularization_parameter = 0.1

logger.info("Training the ALS model...")
model = ALS.train(dfRates.rdd.map(lambda r: (int(r[0]), int(r[1]), r[2])).cache(), rank=rank, seed=seed,
                  iterations=iterations, lambda_=regularization_parameter)
logger.info("ALS model built!")

# Calculate all predictions
predictions = model.recommendProductsForUsers(10) \

예제 #2

파일 보기

파일: spark_svd_postgresql.py 프로젝트: riftadi/movrecsys

TABLE_RATINGS = 'RATINGS'
TABLE_MOVIES = 'MOVIES'
TABLE_RECOMMENDATIONS = 'RECOMMENDATIONS'

t0 = time()
print "Reading data from postgresql..."
dfRates = sqlContext.read.format('jdbc') \
    .option('useSSL', False) \
    .option("url", jdbcUrl) \
    .option("dbtable", TABLE_RATINGS) \
    .option("driver", jdbcDriver) \
    .load()

dfRates.registerTempTable('Ratings')
sqlContext.cacheTable('Ratings')
tt = time() - t0
print "Data is loaded in %s seconds" % round(tt,3)

rank = 8
seed = 5L
iterations = 10
regularization_parameter = 0.1

t0 = time()
print "Training the ALS model..."
model = ALS.train(dfRates.rdd.map(lambda r: (int(r[0]), int(r[1]), r[2])).cache(), rank=rank, seed=seed,
                  iterations=iterations, lambda_=regularization_parameter)
tt = time() - t0
print "ALS model built!"
print "New model trained in %s seconds" % round(tt,3)

예제 #3

파일 보기

        s5 = StructType(field5)
        schema5 = sqlContext.applySchema(d5, s5)
        schema5.registerTempTable("Level5")

        level3_1 = sqlContext.sql("SELECT CONSUMER_ID, KPI_1 FROM Level3 \
                            WHERE KPI_1 >= " + str(kpi_min) +
                                  " AND KPI_1 <= " + str(kpi_max) + " ")
        data3_1 = level3_1.map(lambda z: (str(z[0]), float(z[1])))
        field3_1 = [
            StructField("CONSUMER_ID", StringType(), False),
            StructField("KPI", FloatType(), True)
        ]
        s3_1 = StructType(field3_1)
        schema3_1 = sqlContext.applySchema(data3_1, s3_1)
        schema3_1.registerTempTable("Level3_1")
        sqlContext.cacheTable("Level3_1")

        level5_1 = sqlContext.sql(
            "SELECT CONSUMER_ID, CAMPAIGN_NAME, EVENT_TYPE_ID FROM Level5 WHERE EVENT_DATE BETWEEN '"
            + start_date + "' AND '" + end_date + "'")
        data5_1 = level5_1.map(lambda z: (str(z[0]), str(z[1]), int(z[2])))
        field5_1 = [
            StructField("CONSUMER_ID", StringType(), False),
            StructField("CAMPAIGN_NAME", StringType(), False),
            StructField("EVENT_TYPE_ID", IntegerType(), False)
        ]
        s5_1 = StructType(field5_1)
        schema5_1 = sqlContext.applySchema(data5_1, s5_1)
        schema5_1.registerTempTable("Level5_1")
        sqlContext.cacheTable("Level5_1")

예제 #4

파일 보기

파일: spark_SparkSQL_Test_Level3.py 프로젝트: anjumr06/sparkTest

    args = parser.parse_args()

    kpi_min, kpi_max, input_l3, output_l3, start_date, end_date = args.kpi_min, args.kpi_max, args.input_l3, args.output_l3, args.start_date, args.end_date

    if kpi_min and kpi_max and input_l3 and output_l3 and start_date and end_date:
        conf = SparkConf().setAppName("SparkSQL Evaluation Level3")
        sc = SparkContext(conf=conf)
        sqlContext = SQLContext(sc)
        l3 = sc.textFile(input_l3).coalesce(4)
        d3 = l3.map(lambda z: z.replace('"', '')).map(lambda z: z.split(
            ',')).map(lambda p: (str(p[0]), float(p[1]), float(p[2])))
        field3 = [
            StructField("CONSUMER_ID", StringType(), False),
            StructField("KPI_1", FloatType(), True),
            StructField("KPI_2", FloatType(), True)
        ]
        s3 = StructType(field3)
        schema3 = sqlContext.applySchema(d3, s3)
        schema3.registerTempTable("Level3")
        sqlContext.cacheTable("Level3")

        query3 = sqlContext.sql("SELECT SEGMENT_ID, SLAB, COUNT(*), SUM(KPI_1), MIN(KPI_1), MAX(KPI_1), AVG(KPI_1) FROM \
                (SELECT KPI_1, CASE WHEN KPI_1 >= "                                                        + str(kpi_min) + " AND KPI_1 <= " + str(kpi_max) + " THEN 1 ELSE 0 END SEGMENT_ID, \
                CASE WHEN KPI_1 >= "                                         + str(kpi_min) + " AND KPI_1 <= " + str(kpi_max) + " THEN '" + str(kpi_min) + " - " + str(kpi_max) + \
                    "' ELSE NULL END SLAB FROM Level3) DUMP \
                WHERE SLAB IS NOT NULL \
                GROUP BY SLAB, SEGMENT_ID \
                ORDER BY SEGMENT_ID"                                        )

        query3.coalesce(1).saveAsTextFile(output_l3)