CLOUDSQL_PWD = sys.argv[4]
BEST_RANK = int(sys.argv[5])
BEST_ITERATION = int(sys.argv[6])
BEST_REGULATION = float(sys.argv[7])

TABLE_ITEMS = "Accommodation"
TABLE_RATINGS = "Rating"
TABLE_RECOMMENDATIONS = "Recommendation"

# Read the data from Cloud SQL and create dataframes.
#[START read_from_sql]
jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (
    CLOUDSQL_INSTANCE_IP, CLOUDSQL_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)
dfAccos = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable=TABLE_ITEMS)
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable=TABLE_RATINGS)
#[END read_from_sql]

# Get all the ratings rows of our user.
dfUserRatings = dfRates.filter(dfRates.userId == USER_ID).map(lambda r: r.accoId).collect()
print(dfUserRatings)

# Keep only the accommodations that have not been rated by our user.
rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))

# Split the ratings into training (60%), validating (20%) and testing (20%) sets.
#[START split_sets]
rddTraining, rddValidating, rddTesting = dfRates.rdd.randomSplit([6, 2, 2])
#[END split_sets]
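# A minimal sketch of the natural next step, assuming the MLlib ALS API and
# the variables defined above; the rank/iterations/lambda values come from
# the parsed arguments. This is an illustration, not part of the snippet.
from pyspark.mllib.recommendation import ALS

model = ALS.train(rddTraining, BEST_RANK, BEST_ITERATION, BEST_REGULATION)

# Score every (user, accommodation) pair the user has not rated yet and keep
# the five highest predictions. predictAll returns Rating(user, product, rating).
topPredictions = model.predictAll(pairsPotential).takeOrdered(5, key=lambda p: -p[2])
for p in topPredictions:
    print("acco %s -> predicted rating %.2f" % (p[1], p[2]))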
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType

conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl = 'jdbc:mysql://173.194.227.120:3306/recoom?user=root'
USER_ID = 0

# Read the data from Cloud SQL and create dataframes.
dfAccos = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='AccommodationT')
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='RatingT')

# Get all the ratings rows of our user.
dfUserRatings = dfRates.filter(dfRates.userId == USER_ID).map(lambda r: r.accoId).collect()
print(dfUserRatings)

# Keep only the accommodations that have not been rated by our user.
rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
    # Returns the pairs (prediction, rating).
    predictionsAndRatings = predictions.join(againstWiRatings).values()

    # Returns the RMSE (root-mean-square error) between predictions and ratings.
    return sqrt(
        predictionsAndRatings.map(lambda s: (s[0] - s[1]) ** 2).reduce(add) /
        float(sizeAgainst))
#[END how_far]

# Read the data from Cloud SQL and create dataframes.
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='Rating')
rddUserRatings = dfRates.filter(dfRates.userId == 0).rdd
print(rddUserRatings.count())

# Split the data into three sets: training (60%), validating (20%), testing (20%).
rddRates = dfRates.rdd
rddTraining, rddValidating, rddTesting = rddRates.randomSplit([6, 2, 2])

# Add the user's ratings to the training set. union() returns a new RDD,
# so the result must be reassigned.
rddTraining = rddTraining.union(rddUserRatings)
nbValidating = rddValidating.count()
nbTesting = rddTesting.count()
    againstWiRatings = against.map(lambda x: ((int(x[0]), int(x[1])), int(x[2])))

    # Make a prediction and map it for later comparison.
    # The map has to be ((user, product), rating), not ((product, user), rating).
    predictions = model.predictAll(againstNoRatings).map(lambda p: ((p[0], p[1]), p[2]))

    # Returns the pairs (prediction, rating).
    predictionsAndRatings = predictions.join(againstWiRatings).values()

    # Returns the RMSE (root-mean-square error) between predictions and ratings.
    return sqrt(
        predictionsAndRatings.map(lambda s: (s[0] - s[1]) ** 2).reduce(add) /
        float(sizeAgainst))
#[END how_far]

# Read the data from Cloud SQL and create dataframes.
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='Rating')
rddUserRatings = dfRates.filter(dfRates.userId == 0).rdd
print(rddUserRatings.count())

# Split the data into three sets: training (60%), validating (20%), testing (20%).
rddRates = dfRates.rdd
rddTraining, rddValidating, rddTesting = rddRates.randomSplit([6, 2, 2])

# Add the user's ratings to the training set. union() returns a new RDD,
# so the result must be reassigned.
rddTraining = rddTraining.union(rddUserRatings)
nbValidating = rddValidating.count()
nbTesting = rddTesting.count()

print("Training: %d, validation: %d, test: %d" % (rddTraining.count(), nbValidating, nbTesting))
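# A minimal sketch of how a howFar()-style RMSE helper can drive the grid
# search that produces BEST_RANK, BEST_ITERATION and BEST_REGULATION. The
# parameter ranges are illustrative, and howFar(model, size, against) is
# assumed to match the function whose tail appears above.
from pyspark.mllib.recommendation import ALS

bestRmse, bestParams = float("inf"), None
for rank in [5, 10, 15, 20]:
    for numIter in [10, 20]:
        for regul in [0.01, 0.1, 1.0]:
            model = ALS.train(rddTraining, rank, numIter, regul)
            rmse = howFar(model, nbValidating, rddValidating)
            if rmse < bestRmse:
                bestRmse, bestParams = rmse, (rank, numIter, regul)

print("Best (rank, iterations, regularization): %s, RMSE %.3f" % (bestParams, bestRmse))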
BEST_RANK = int(sys.argv[5])
BEST_ITERATION = int(sys.argv[6])
BEST_REGULATION = float(sys.argv[7])

TABLE_ITEMS = "Accommodation"
TABLE_RATINGS = "Rating"
TABLE_RECOMMENDATIONS = "Recommendation"

# Read the data from Cloud SQL and create dataframes.
#[START read_from_sql]
jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (
    CLOUDSQL_INSTANCE_IP, CLOUDSQL_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)
dfAccos = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable=TABLE_ITEMS)
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable=TABLE_RATINGS)
#[END read_from_sql]

# Get all the ratings rows of our user.
dfUserRatings = dfRates.filter(dfRates.userId == USER_ID).map(lambda r: r.accoId).collect()
print(dfUserRatings)

# Keep only the accommodations that have not been rated by our user.
rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.mllib.recommendation import ALS

conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl = 'jdbc:mysql://173.194.227.120:3306/recoom?user=root'
USER_ID = 0

# Read the data from Cloud SQL and create dataframes.
dfAccos = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='AccommodationT')
dfRates = sqlContext.load(source='jdbc', driver=jdbcDriver, url=jdbcUrl, dbtable='RatingT')

# Get all the ratings rows of our user.
dfUserRatings = dfRates.filter(dfRates.userId == USER_ID).map(lambda r: r.accoId).collect()
print(dfUserRatings)

# Keep only the accommodations that have not been rated by our user.
rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))

# Split the ratings into training (60%), validating (20%) and testing (20%) sets.
rddTraining, rddValidating, rddTesting = dfRates.rdd.randomSplit([6, 2, 2])

# Train an ALS model: rank=20, 20 iterations, regularization lambda=0.1.
model = ALS.train(rddTraining, 20, 20, 0.1)
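# A minimal sketch of using the freshly trained model: score the unrated
# pairs and join the results back to the accommodation rows. It assumes
# x[0] is the accommodation id and x[1] its name, and that the id types
# on both sides of the join match.
predictions = model.predictAll(pairsPotential).map(lambda p: (p[1], p[2]))

topRecos = (predictions
            .join(dfAccos.rdd.map(lambda x: (x[0], x[1])))
            .takeOrdered(5, key=lambda kv: -kv[1][0]))

for accoId, (rating, name) in topRecos:
    print("%s (id %s): predicted %.2f" % (name, accoId, rating))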
    filtered_list = [list(x) for x in subset_list_filtered]
    for subset in filtered_list:
        for rule in association_rules:
            r_0 = sorted(list(rule[0]))
            r_1 = sorted(list(rule[1]))
            # Keep multi-item antecedents containing 'কথা' (Bengali: "word")
            # whose single-item consequent has confidence above 90.
            if subset == r_0 and len(subset) > 1 and (
                    'কথা' in subset) and len(r_1) == 1 and rule[2] > 90:
                print(r_0, '>', r_1, rule[2])
                print('-----------------------------')


# Starting point of the program.
if __name__ == '__main__':
    # Input to read (loaded through the SQL context here).
    input_file_name = sqlContext.load(Rdd1)
    test_file_name = sqlContext.load(Rdd1)
    # Minimum support threshold.
    minimum_support_threshold = 3
    # Minimum confidence threshold.
    minimum_confidence_threshold = 90
    # Creating the Apriori object.
    apriori = Apriori()
    # Reading data from the file.
    apriori.read_file(input_file_name)
    # Executing the apriori algorithm.
    print('##########################################################################################')
    print('Training Phase')
    print('##########################################################################################')
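# A minimal sketch of where a rule's confidence value (rule[2] above) comes
# from; the helper name and 'transactions' argument are illustrative, not
# part of the original program.
def confidence(antecedent, consequent, transactions):
    """confidence(A -> B) = support(A and B) / support(A), as a percentage."""
    a = frozenset(antecedent)
    ab = a | frozenset(consequent)
    support_a = sum(1 for t in transactions if a <= set(t))
    support_ab = sum(1 for t in transactions if ab <= set(t))
    return 100.0 * support_ab / support_a if support_a else 0.0

# Example: A appears in two transactions, A and B together in one -> 50%.
# confidence(['bread'], ['milk'], [['bread', 'milk'], ['bread']]) == 50.0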
sqc = SQLContext(sc)

# Idea: read the CSV directly into a Spark dataframe.
# Defining the schema. Header and a sample row:
# msisdn,SongUniqueCode,Duration,Circle,DATE,DNIS,MODE,businesscategory
# 9037991838,Hun-14-63767,202,Kolkata,10/1/2014,59090,,HindiTop20
mySchema = sql.types.StructType([
    sql.types.StructField("msisdn", sql.types.StringType(), False),
    sql.types.StructField("songid", sql.types.StringType(), False),
    sql.types.StructField("duration", sql.types.IntegerType(), True),
    sql.types.StructField("Circle", sql.types.StringType(), True),
    sql.types.StructField("date", sql.types.StringType(), True),
    # The CSV has eight columns; a field for DNIS keeps the schema aligned.
    sql.types.StructField("dnis", sql.types.StringType(), True),
    sql.types.StructField("mode", sql.types.StringType(), True),
    sql.types.StructField("businesscategory", sql.types.StringType(), True)
])

transdf = sqc.load(source="com.databricks.spark.csv",
                   path="file:///home/loq/sunil/spark/content_data.csv",
                   schema=mySchema)
transdf.take(2)

# Alternative: reading the file as a plain text RDD.
'''
transrdd = sc.textFile("file:///home/loq/sunil/spark/content_data.csv").\
    map(lambda x: x.split(',')).\
    map(lambda y: sql.Row(msisdn=y[0], songid=y[1], duration=y[2],
                          circle=y[3], businesscategory=y[7]))
print(transrdd.take(2))
'''
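# A minimal sketch of querying the dataframe loaded above, assuming the
# Spark 1.x DataFrame API that matches the sqc.load() call used here.
from pyspark.sql.functions import desc

# Plays per business category, most popular first.
transdf.groupBy("businesscategory").count().orderBy(desc("count")).show(10)

# Total listening time per subscriber (duration is an IntegerType column).
transdf.groupBy("msisdn").sum("duration").show(5)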