def test_parse_vector(self):
    """Round-trip dense and sparse vectors through str() and parse()."""
    # Empty dense vector.
    empty_dense = DenseVector([])
    self.assertEqual(str(empty_dense), '[]')
    self.assertEqual(Vectors.parse(str(empty_dense)), empty_dense)
    # Non-empty dense vector.
    dense = DenseVector([3, 4, 6, 7])
    self.assertEqual(str(dense), '[3.0,4.0,6.0,7.0]')
    self.assertEqual(Vectors.parse(str(dense)), dense)
    # Empty sparse vector.
    empty_sparse = SparseVector(4, [], [])
    self.assertEqual(str(empty_sparse), '(4,[],[])')
    self.assertEqual(SparseVector.parse(str(empty_sparse)), empty_sparse)
    # Non-empty sparse vector.
    sparse = SparseVector(4, [0, 2], [3, 4])
    self.assertEqual(str(sparse), '(4,[0,2],[3.0,4.0])')
    self.assertEqual(Vectors.parse(str(sparse)), sparse)
    # Parsing tolerates embedded whitespace.
    padded = SparseVector(10, [0, 1], [4, 5])
    self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), padded)
def test_parse_vector(self):
    """Round-trip vectors through str()/parse().

    Fixed: the original used self.assertTrue(x, y), whose second argument
    is the failure *message*, not a comparand — it only checks that x is
    truthy, so these comparisons could never fail.  Replaced each with
    self.assertEqual so the round-trip is actually verified.
    """
    a = DenseVector([3, 4, 6, 7])
    self.assertEqual(str(a), "[3.0,4.0,6.0,7.0]")
    self.assertEqual(Vectors.parse(str(a)), a)
    a = SparseVector(4, [0, 2], [3, 4])
    self.assertEqual(str(a), "(4,[0,2],[3.0,4.0])")
    self.assertEqual(Vectors.parse(str(a)), a)
    # Parsing tolerates embedded whitespace.
    a = SparseVector(10, [0, 1], [4, 5])
    self.assertEqual(SparseVector.parse(" (10, [0,1 ],[ 4.0,5.0] )"), a)
def cosineSimilarity(a, b):
    """Return the cosine similarity of sparse vectors `a` and `b`.

    Returns 0 when either vector has zero magnitude (avoids division by
    zero).  `a` is round-tripped through str()/parse before the dot
    product, as in the original implementation.
    """
    a_magnitude = math.sqrt(float(sum(v ** 2 for v in a.values)))
    b_magnitude = math.sqrt(float(sum(v ** 2 for v in b.values)))
    # Re-parse `a` from its string form before taking the dot product.
    reparsed_a = SparseVector.parse(str(a))
    numerator = reparsed_a.dot(b)
    denominator = a_magnitude * b_magnitude
    if denominator == 0:
        return 0
    return numerator / denominator
def cosine_pre_process(line):
    """Yield ordered id pairs whose vectors have cosine similarity > 0.60.

    `line` is a (key, matches) pair where `matches` is a sequence of
    (doc_id, sparse_vector_string) tuples — assumed from the indexing
    below; TODO confirm against callers.

    For every pair i < j of matches whose parsed vectors exceed the 0.60
    cosine-similarity threshold, yields (smaller_id, larger_id).

    Fixed: the original built two throwaway SparseVector(1, [0], [1])
    instances solely to call the static `parse` through them, initialized
    unused loop counters, and emulated `xrange(i + 1, n)` with a manual
    while loop.  The sequence of yielded pairs is unchanged.
    """
    matches = line[1]
    n = len(matches)
    # Compare every unordered pair (i, j) with i < j.
    for i in xrange(n - 1):
        for j in xrange(i + 1, n):
            # `parse` is a static factory on SparseVector.
            sf = SparseVector.parse(matches[i][1])
            ss = SparseVector.parse(matches[j][1])
            dotp = sf.dot(ss)
            # Product of the two vector magnitudes (cosine denominator).
            # NOTE(review): a zero-magnitude vector still divides by zero
            # here, exactly as before — confirm inputs are never all-zero.
            rss = np.sqrt(sum(np.square(sf.values))) * np.sqrt(
                sum(np.square(ss.values)))
            if dotp / rss > .60:
                # Emit the pair with the smaller id first.
                if matches[i][0] < matches[j][0]:
                    yield matches[i][0], matches[j][0]
                else:
                    yield matches[j][0], matches[i][0]
def _parse_to_libsvm(self, param):
    """Convert a libsvm-style feature string ('i:v i:v ...') to a SparseVector.

    Indices in `param` are 1-based and are shifted to 0-based.  The declared
    vector size is twice the number of features, as in the original.
    """
    fields = param.split(' ')
    size = str(len(fields) * 2)
    indices = []
    values = []
    for field in fields:
        parts = field.split(':')
        indices.append(str(int(parts[0]) - 1))  # shift to 0-based
        values.append(parts[1])
    # Build the "(size, [indices],[values])" form SparseVector.parse expects.
    parsed_str = '(' + size + ', [' + ','.join(indices) + '],[' + ','.join(values) + '])'
    return SparseVector.parse(parsed_str)
def main():
    """Driver: load pre-trained KMeans and Word2Vec models, build one
    normalized cluster-histogram vector per review, then train and evaluate
    LinearRegressionWithSGD, printing the final RMSE and best step size.
    """
    # Command-line arguments: model locations and the input JSON file.
    k_input_model = sys.argv[1] #read kmean model from this location
    w_input_model = sys.argv[2] #read word2vec model from this location
    input_file = sys.argv[3] #read input file
    conf = SparkConf().setAppName('Clustering')
    sc = SparkContext(conf=conf)
    # Fail fast on older runtimes.  NOTE(review): this is a lexicographic
    # string comparison (e.g. '1.10.0' < '1.5.1') — confirm acceptable.
    assert sc.version >= '1.5.1'
    sqlContext = SQLContext(sc)
    '''sbaronia - load both kmean and Word2Vec model'''
    kmean_model = KMeansModel.load(sc,k_input_model)
    word2vec_model = Word2VecModel.load(sc,w_input_model)
    '''sbaronia - select fields from json and make data frame zipped with index'''
    review = sqlContext.read.json(input_file).select('reviewText','overall','reviewTime').cache()
    # Drop reviews with empty text.
    review_df = review.filter(review.reviewText != "").cache()
    # rdd_zip (defined elsewhere in this file) presumably pairs each element
    # with its index — confirm against its definition.
    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()
    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()
    # Tokenized/cleaned words per review, materialized on the driver.
    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()
    clean_list = clean_words_rdd.collect()
    '''sbaronia - make a list of all words in our model'''
    keys = sqlContext.read.parquet(w_input_model+"/data")
    keys_list = keys.rdd.map(lambda line: line.word).collect()
    '''sbaronia - here we create one vector per review, where vector contains the number of times a cluster is assinged to a word in a review. 
 We make a SparseVector compatible format'''
    features = []
    for i in range(len(clean_list)):
        # One 2000-bin histogram per review: counts of cluster assignments.
        # NOTE(review): `word in keys_list` is an O(n) list scan per word;
        # a set would be much faster — left unchanged here.
        histogram = [0] * 2000
        for word in clean_list[i]:
            if word in keys_list:
                vec = word2vec_model.transform(word)
                clust = kmean_model.predict(vec)
                # Both branches amount to incrementing the bin by one.
                if histogram[clust] > 0:
                    histogram[clust] = histogram[clust] + 1
                else:
                    histogram[clust] = 1
        # (size, indices, values) triple in SparseVector-compatible shape.
        features.append((2000,range(2000),histogram))
    '''sbaronia - create a normalized SparseVector rdd'''
    # L1 normalization of each histogram vector.
    nor = Normalizer(1)
    # NOTE(review): SparseVector.parse is handed the tuple directly —
    # confirm parse accepts non-string input (it may stringify internally).
    features_rdd = rdd_zip(sc.parallelize(features) \
                    .map(lambda line: nor.transform(SparseVector.parse(line))) \
                    .cache()).cache()
    '''sbaronia - make a dataframe with rating, year and vector per review'''
    features_df = sqlContext.createDataFrame(features_rdd, ['feature', 'index']).cache()
    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    featyearrate_df = features_df.join(year_rating_df, features_df.index == year_rating_df.index, 'inner') \
                      .drop(features_df.index).cache()
    '''sbaronia - create training and testing data based on year'''
    # Train on pre-2014 reviews, test on 2014 reviews.
    train_rdd = featyearrate_df.filter(featyearrate_df.year < 2014) \
                .select('rating','feature') \
                .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                .coalesce(1) \
                .cache()
    test_rdd = featyearrate_df.filter(featyearrate_df.year == 2014) \
               .select('rating','feature') \
               .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
               .coalesce(1) \
               .cache()
    '''sbaronia - find best step using validation and run LinearRegressionWithSGD with that step and report final RMSE'''
    step_best_norm = validation(train_rdd)
    RMSE_norm = regression_and_error(train_rdd,test_rdd,step_best_norm)
    print("Final RMSE(Normalization) = " + str(RMSE_norm) + " Best Step size = " + str(step_best_norm))
def to_labeledpoint(line):
    """Parse a 'label :: sparse-vector' text line into a LabeledPoint."""
    fields = line.split(' :: ')
    # fields[0] is the label, fields[1] the serialized sparse vector.
    return LabeledPoint(fields[0], SparseVector.parse(fields[1]))
def normalized_labeledpoint(line, nor):
    """Parse a 'label :: sparse-vector' line into a LabeledPoint whose
    features are transformed by the normalizer `nor`."""
    fields = line.split(' :: ')
    normalized = nor.transform(SparseVector.parse(fields[1]))
    return LabeledPoint(fields[0], normalized)
# combined RDD is of [(4, ((2, 2.0), 3.4))] form normalizedRatingRDD = combinedRDD.map(lambda (x, y): (x, (y[0][0], y[0][1] - y[1]))) sparseRatingRDD = normalizedRatingRDD.groupByKey().map( lambda (x, y): (x, Vectors.sparse(numUsers, y))) ## Step 5 - Perform Recommendation for i in range(0, 10): ## 1) select a random movie, and associated rating vector randomMovieTuple = sparseRatingRDD.takeSample( False, 1)[0] # tuple ( movieID, sparseVector ) randMovieId = randomMovieTuple[0] # extract movieId randMovieVector = SparseVector.parse(str( randomMovieTuple[1])) # SparseVector associated with that movie # From this movie vector, we will randomly select a userId and set their rating to zero. # The idea would be to try to predict that rating and see how close we come to the actual value predVecValues = randMovieVector.values predVecIndices = randMovieVector.indices index = random.randint(0, len(predVecValues) - 1) predVecValues[index] = 0 # set rating to zero randUserId = predVecIndices[index] ## 2) compute cosine simularity with "randMovieVector" and each vector in RDD # result is a RDD of (cosSimValue, movieId) result = sparseRatingRDD.map(lambda v : (v[0], cosineSimilarity(v[1], randMovieVector)))\ .map(lambda x: (x[1], x[0]))\ .sortByKey(ascending=False)