CLOUDSQL_PWD = sys.argv[4]
BEST_RANK = int(sys.argv[5])
BEST_ITERATION = int(sys.argv[6])
BEST_REGULATION = float(sys.argv[7])

TABLE_ITEMS = "Accommodation"
TABLE_RATINGS = "Rating"
TABLE_RECOMMENDATIONS = "Recommendation"

# Read the data from Cloud SQL and create dataframes.
#[START read_from_sql]
jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (
    CLOUDSQL_INSTANCE_IP, CLOUDSQL_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)
dfAccos = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable=TABLE_ITEMS)
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable=TABLE_RATINGS)
#[END read_from_sql]

# Get all the ratings rows of our user.
dfUserRatings = dfRates.filter(dfRates.userId == USER_ID).map(lambda r: r.accoId).collect()
print(dfUserRatings)

# Keep only the accommodations that have not been rated by our user.
rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))

# Split the ratings into training (60%), validating (20%) and testing (20%) sets.
#[START split_sets]
rddTraining, rddValidating, rddTesting = dfRates.rdd.randomSplit([6, 2, 2])
#[END split_sets]
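# A minimal sketch of the natural next step, assuming the MLlib ALS API and
# the variables defined above; the rank/iterations/lambda values come from
# the parsed arguments. This is an illustration, not part of the snippet.
from pyspark.mllib.recommendation import ALS

model = ALS.train(rddTraining, BEST_RANK, BEST_ITERATION, BEST_REGULATION)

# Score every (user, accommodation) pair the user has not rated yet and keep
# the five highest predictions. predictAll returns Rating(user, product, rating).
topPredictions = model.predictAll(pairsPotential).takeOrdered(5, key=lambda p: -p[2])
for p in topPredictions:
    print("acco %s -> predicted rating %.2f" % (p[1], p[2]))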
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType

conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl = 'jdbc:mysql://173.194.227.120:3306/recoom?user=root'
USER_ID = 0

# Read the data from Cloud SQL and create dataframes.
dfAccos = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='AccommodationT')
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='RatingT')

# Get all the ratings rows of our user.
dfUserRatings = dfRates.filter(dfRates.userId == USER_ID).map(lambda r: r.accoId).collect()
print(dfUserRatings)

# Keep only the accommodations that have not been rated by our user.
rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
    # Returns the pairs (prediction, rating).
    predictionsAndRatings = predictions.join(againstWiRatings).values()

    # Returns the RMSE (root-mean-square error) between predictions and ratings.
    return sqrt(
        predictionsAndRatings.map(lambda s: (s[0] - s[1]) ** 2).reduce(add) /
        float(sizeAgainst))
#[END how_far]

# Read the data from Cloud SQL and create dataframes.
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='Rating')
rddUserRatings = dfRates.filter(dfRates.userId == 0).rdd
print(rddUserRatings.count())

# Split the data into three sets: training (60%), validating (20%), testing (20%).
rddRates = dfRates.rdd
rddTraining, rddValidating, rddTesting = rddRates.randomSplit([6, 2, 2])

# Add the user's ratings to the training set. union() returns a new RDD,
# so the result must be reassigned.
rddTraining = rddTraining.union(rddUserRatings)
nbValidating = rddValidating.count()
nbTesting = rddTesting.count()
    againstWiRatings = against.map(lambda x: ((int(x[0]), int(x[1])), int(x[2])))

    # Make a prediction and map it for later comparison.
    # The map has to be ((user, product), rating), not ((product, user), rating).
    predictions = model.predictAll(againstNoRatings).map(lambda p: ((p[0], p[1]), p[2]))

    # Returns the pairs (prediction, rating).
    predictionsAndRatings = predictions.join(againstWiRatings).values()

    # Returns the RMSE (root-mean-square error) between predictions and ratings.
    return sqrt(
        predictionsAndRatings.map(lambda s: (s[0] - s[1]) ** 2).reduce(add) /
        float(sizeAgainst))
#[END how_far]

# Read the data from Cloud SQL and create dataframes.
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='Rating')
rddUserRatings = dfRates.filter(dfRates.userId == 0).rdd
print(rddUserRatings.count())

# Split the data into three sets: training (60%), validating (20%), testing (20%).
rddRates = dfRates.rdd
rddTraining, rddValidating, rddTesting = rddRates.randomSplit([6, 2, 2])

# Add the user's ratings to the training set. union() returns a new RDD,
# so the result must be reassigned.
rddTraining = rddTraining.union(rddUserRatings)
nbValidating = rddValidating.count()
nbTesting = rddTesting.count()

print("Training: %d, validation: %d, test: %d" % (rddTraining.count(), nbValidating, nbTesting))
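# A minimal sketch of how a howFar()-style RMSE helper can drive the grid
# search that produces BEST_RANK, BEST_ITERATION and BEST_REGULATION. The
# parameter ranges are illustrative, and howFar(model, size, against) is
# assumed to match the function whose tail appears above.
from pyspark.mllib.recommendation import ALS

bestRmse, bestParams = float("inf"), None
for rank in [5, 10, 15, 20]:
    for numIter in [10, 20]:
        for regul in [0.01, 0.1, 1.0]:
            model = ALS.train(rddTraining, rank, numIter, regul)
            rmse = howFar(model, nbValidating, rddValidating)
            if rmse < bestRmse:
                bestRmse, bestParams = rmse, (rank, numIter, regul)

print("Best (rank, iterations, regularization): %s, RMSE %.3f" % (bestParams, bestRmse))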
BEST_RANK = int(sys.argv[5])
BEST_ITERATION = int(sys.argv[6])
BEST_REGULATION = float(sys.argv[7])

TABLE_ITEMS = "Accommodation"
TABLE_RATINGS = "Rating"
TABLE_RECOMMENDATIONS = "Recommendation"

# Read the data from Cloud SQL and create dataframes.
#[START read_from_sql]
jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (
    CLOUDSQL_INSTANCE_IP, CLOUDSQL_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)
dfAccos = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable=TABLE_ITEMS)
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable=TABLE_RATINGS)
#[END read_from_sql]

# Get all the ratings rows of our user.
dfUserRatings = dfRates.filter(dfRates.userId == USER_ID).map(lambda r: r.accoId).collect()
print(dfUserRatings)

# Keep only the accommodations that have not been rated by our user.
rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.mllib.recommendation import ALS

conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl = 'jdbc:mysql://173.194.227.120:3306/recoom?user=root'
USER_ID = 0

# Read the data from Cloud SQL and create dataframes.
dfAccos = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='AccommodationT')
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='RatingT')

# Get all the ratings rows of our user.
dfUserRatings = dfRates.filter(dfRates.userId == USER_ID).map(lambda r: r.accoId).collect()
print(dfUserRatings)

# Keep only the accommodations that have not been rated by our user.
rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))

# Split the ratings into training (60%), validating (20%) and testing (20%) sets.
rddTraining, rddValidating, rddTesting = dfRates.rdd.randomSplit([6, 2, 2])

# Train an ALS model: rank=20, 20 iterations, regularization lambda=0.1.
model = ALS.train(rddTraining, 20, 20, 0.1)
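# A minimal sketch of using the freshly trained model: score the unrated
# pairs and join the results back to the accommodation rows. It assumes
# x[0] is the accommodation id and x[1] its name, and that the id types
# on both sides of the join match.
predictions = model.predictAll(pairsPotential).map(lambda p: (p[1], p[2]))

topRecos = (predictions
            .join(dfAccos.rdd.map(lambda x: (x[0], x[1])))
            .takeOrdered(5, key=lambda kv: -kv[1][0]))

for accoId, (rating, name) in topRecos:
    print("%s (id %s): predicted %.2f" % (name, accoId, rating))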
    filtered_list = [list(x) for x in subset_list_filtered]
    for subset in filtered_list:
        for rule in association_rules:
            r_0 = sorted(list(rule[0]))
            r_1 = sorted(list(rule[1]))
            # Keep multi-item antecedents containing 'কথা' (Bengali: "word")
            # whose single-item consequent has confidence above 90.
            if subset == r_0 and len(subset) > 1 and (
                    'কথা' in subset) and len(r_1) == 1 and rule[2] > 90:
                print(r_0, '>', r_1, rule[2])
                print('-----------------------------')


# Starting point of the program.
if __name__ == '__main__':
    # Input to read (loaded through the SQL context here).
    input_file_name = sqlContext.load(Rdd1)
    test_file_name = sqlContext.load(Rdd1)
    # Minimum support threshold.
    minimum_support_threshold = 3
    # Minimum confidence threshold.
    minimum_confidence_threshold = 90
    # Creating the Apriori object.
    apriori = Apriori()
    # Reading data from the file.
    apriori.read_file(input_file_name)
    # Executing the apriori algorithm.
    print('##########################################################################################')
    print('Training Phase')
    print('##########################################################################################')
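# A minimal sketch of where a rule's confidence value (rule[2] above) comes
# from; the helper name and 'transactions' argument are illustrative, not
# part of the original program.
def confidence(antecedent, consequent, transactions):
    """confidence(A -> B) = support(A and B) / support(A), as a percentage."""
    a = frozenset(antecedent)
    ab = a | frozenset(consequent)
    support_a = sum(1 for t in transactions if a <= set(t))
    support_ab = sum(1 for t in transactions if ab <= set(t))
    return 100.0 * support_ab / support_a if support_a else 0.0

# Example: A appears in two transactions, A and B together in one -> 50%.
# confidence(['bread'], ['milk'], [['bread', 'milk'], ['bread']]) == 50.0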
sqc = SQLContext(sc)

# Idea: read the CSV directly into a Spark dataframe.
# Defining the schema. Header and a sample row:
# msisdn,SongUniqueCode,Duration,Circle,DATE,DNIS,MODE,businesscategory
# 9037991838,Hun-14-63767,202,Kolkata,10/1/2014,59090,,HindiTop20
mySchema = sql.types.StructType([
    sql.types.StructField("msisdn", sql.types.StringType(), False),
    sql.types.StructField("songid", sql.types.StringType(), False),
    sql.types.StructField("duration", sql.types.IntegerType(), True),
    sql.types.StructField("Circle", sql.types.StringType(), True),
    sql.types.StructField("date", sql.types.StringType(), True),
    # The CSV has eight columns; a field for DNIS keeps the schema aligned.
    sql.types.StructField("dnis", sql.types.StringType(), True),
    sql.types.StructField("mode", sql.types.StringType(), True),
    sql.types.StructField("businesscategory", sql.types.StringType(), True)
])

transdf = sqc.load(source="com.databricks.spark.csv",
                   path="file:///home/loq/sunil/spark/content_data.csv",
                   schema=mySchema)
transdf.take(2)

# Alternative: reading the file as a plain text RDD.
'''
transrdd = sc.textFile("file:///home/loq/sunil/spark/content_data.csv").\
    map(lambda x: x.split(',')).\
    map(lambda y: sql.Row(msisdn=y[0], songid=y[1], duration=y[2],
                          circle=y[3], businesscategory=y[7]))
print(transrdd.take(2))
'''
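# A minimal sketch of querying the dataframe loaded above, assuming the
# Spark 1.x DataFrame API that matches the sqc.load() call used here.
from pyspark.sql.functions import desc

# Plays per business category, most popular first.
transdf.groupBy("businesscategory").count().orderBy(desc("count")).show(10)

# Total listening time per subscriber (duration is an IntegerType column).
transdf.groupBy("msisdn").sum("duration").show(5)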