class PyVertexRDDTestCase(unittest.TestCase):
    """
    Test collect, take, count, mapValues, diff, filter,
    mapVertexPartitions, innerJoin and leftJoin for VertexRDD
    """

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")

    def tearDown(self):
        self.sc.stop()

    def collect(self):
        vertexData = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")),
                                   (7, ("jgonzal", "postdoc"))])

    def take(self):
        vertexData = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.take(1)
        self.assertEqual(results, [(3, ("rxin", "student"))])

    def count(self):
        vertexData = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.count()
        self.assertEqual(results, 2)

    def mapValues(self):
        vertexData = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.mapValues(
            lambda x: tuple(v + ":" + v for v in x)).collect()
        self.assertEqual(results, [(3, ("rxin:rxin", "student:student")),
                                   (7, ("jgonzal:jgonzal", "postdoc:postdoc"))])

    def innerJoin(self):
        vertexData0 = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize(
            [(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.innerJoin(vertices1).collect()
        self.assertEqual(results, [])

    def leftJoin(self):
        vertexData0 = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize(
            [(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1).count()
        self.assertEqual(results, 2)
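    # Hedged sketch, not part of the original suite: the class docstring lists a
    # filter test that is missing above. This assumes VertexRDD.filter takes a
    # predicate over (id, attr) pairs, like RDD.filter; the experimental API may
    # differ.
    def filter(self):
        vertexData = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.filter(lambda kv: kv[0] > 3).collect()
        self.assertEqual(results, [(7, ("jgonzal", "postdoc"))])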
class PyEdgeRDDTestCase(unittest.TestCase):
    """
    Test collect, take, count, mapValues, filter and innerJoin for EdgeRDD
    """

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")

    def tearDown(self):
        self.sc.stop()

    # TODO
    def collect(self):
        vertexData = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")),
                                   (7, ("jgonzal", "postdoc"))])

    # TODO
    def take(self):
        vertexData = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")),
                                   (7, ("jgonzal", "postdoc"))])

    # TODO
    def count(self):
        vertexData = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, 2)

    # TODO
    def mapValues(self):
        vertexData = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, 2)

    # TODO
    def filter(self):
        return

    # TODO
    def innerJoin(self):
        vertexData0 = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize(
            [(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1)
        self.assertEqual(results, 2)
class PySparkStreamingTestCase(unittest.TestCase):

    timeout = 10  # seconds
    duration = 1

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")
        # TODO: decrease duration to speed up tests
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        self.ssc.stop()

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print "timeout after", self.timeout

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)

        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect the items from each RDD into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def _test_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be list of lists.
        @param func: wrapped function. This function should return a PythonDStream object.
        @param expected: expected output for this testcase.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]
        input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None

        # Apply test function to stream.
        if input2:
            stream = func(input_stream, input_stream2)
        else:
            stream = func(input_stream)

        result = self._collect(stream, len(expected))
        if sort:
            self._sort_result_based_on_key(result)
            self._sort_result_based_on_key(expected)
        self.assertEqual(expected, result)

    def _sort_result_based_on_key(self, outputs):
        """Sort the list based on first value."""
        for output in outputs:
            output.sort(key=lambda x: x[0])
class PyGraphXTestCase(unittest.TestCase):
    """
    Test vertices, edges, partitionBy, numEdges, numVertices, inDegrees,
    outDegrees, degrees, triplets, mapVertices, mapEdges, mapTriplets,
    reverse, subgraph, groupEdges, joinVertices, outerJoinVertices,
    collectNeighborIds, collectNeighbors, mapReduceTriplets, triangleCount
    for Graph
    """

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")

    def tearDown(self):
        self.sc.stop()

    def collect(self):
        vertexData = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")),
                                   (7, ("jgonzal", "postdoc"))])

    def take(self):
        vertexData = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.take(1)
        self.assertEqual(results, [(3, ("rxin", "student"))])

    def count(self):
        vertexData = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.count()
        self.assertEqual(results, 2)

    def mapValues(self):
        vertexData = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.mapValues(
            lambda x: tuple(v + ":" + v for v in x)).collect()
        self.assertEqual(results, [(3, ("rxin:rxin", "student:student")),
                                   (7, ("jgonzal:jgonzal", "postdoc:postdoc"))])

    def diff(self):
        vertexData0 = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize(
            [(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1).count()
        self.assertEqual(results, 2)

    def innerJoin(self):
        vertexData0 = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize(
            [(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.innerJoin(vertices1).collect()
        self.assertEqual(results, [])

    def leftJoin(self):
        vertexData0 = self.sc.parallelize(
            [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize(
            [(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1).count()
        self.assertEqual(results, 2)
def main():
    reload(sys)
    sys.setdefaultencoding("utf-8")

    # spark config
    conf = SparkConf()
    conf.setMaster("local").setAppName("MemoryBasedCF")
    conf.set("spark.network.timeout", "3600s")
    conf.set("spark.executor.heartbeatInterval", "3000s")
    conf.set("spark.executor.memory", "10g")
    conf.set("spark.driver.memory", "4g")
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir("checkpoint")
    sqlContext = SQLContext(sc)

    graph_path = 'graph.gml'
    run = sys.argv[1]
    frdnWalk_train = "randomWalkResult_train.txt"
    frdnWalk_valid = "randomWalkResult_valid.txt"
    frdnWalk_test = "randomWalkResult_test.txt"

    # find business attr mappings
    train_business = 'PA/Restaurants/train/PA_train_yelp_academic_dataset_business.csv'
    rawData_business = sqlContext.read.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .option("inferschema", "true")\
        .option("mode", "DROPMALFORMED")\
        .load(train_business).rdd
    busiAttrMap = findBusiAttrMapping(rawData_business)

    if os.path.exists(graph_path):
        G = nx.read_gml(graph_path)
        with open('businessNodes.txt', 'rb') as f:
            B = pickle.load(f)
        with open('userNodes.txt', 'rb') as f:
            U = pickle.load(f)
    else:
        lda_upath = "user_reviews_topic.csv"
        lda_bpath = "business_reviews_topic.csv"
        LDAU, LDAB = loadLDA(sqlContext, lda_upath, lda_bpath)
        G, B, U = buildGraph(sc, sqlContext, LDAU, LDAB, busiAttrMap)
    print("Graph Loaded")

    if run == 'R':
        print('Walk Start')
        rdnWalkRes = graphWalk(G, B, U, 'train', sqlContext)
        with open(frdnWalk_train, 'wb') as f:
            pickle.dump(rdnWalkRes, f)
    elif run == 'V':
        B, U = getIdMaps(sqlContext,
                         'PA/Restaurants/valid/PA_valid_yelp_academic_dataset_review.csv',
                         B, U)
        print('Walk Start')
        rdnWalkRes = graphWalk(G, B, U, 'valid', sqlContext)
        with open(frdnWalk_valid, 'wb') as f:
            pickle.dump(rdnWalkRes, f)
    elif run == 'T':
        B, U = getIdMaps(sqlContext,
                         'PA/Restaurants/test/PA_test_yelp_academic_dataset_review.csv',
                         B, U)
        print('Walk Start')
        rdnWalkRes = graphWalk(G, B, U, 'test', sqlContext)
        with open(frdnWalk_test, 'wb') as f:
            pickle.dump(rdnWalkRes, f)
    return
shifts.foreachRDD(print_shifts)


if __name__ == "__main__":
    if len(sys.argv) >= 2 and sys.argv[1] == "test":
        # Run the tests
        del sys.argv[1]
        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(appName='unit_test', conf=conf)
        sc.setLogLevel("WARN")
        sc.setCheckpointDir("/tmp")
        unittest.main()
        sc.stop()
    else:
        # Run the main()
        sc = SparkContext(appName="BoostWords")
        sc.setLogLevel("WARN")
        ssc = StreamingContext(sc, 5)
        ssc.checkpoint("checkpoint")
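# Hedged sketch of the print_shifts callback referenced above (its definition is
# not part of this snippet): foreachRDD can pass the batch time and the batch RDD,
# and a minimal implementation simply collects and prints the batch contents.
def print_shifts(time, rdd):
    print("----- batch at %s -----" % str(time))
    for record in rdd.collect():
        print(record)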
class PySparkStreamingTestCase(unittest.TestCase):

    timeout = 20  # seconds
    duration = 1

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")
        # TODO: decrease duration to speed up tests
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        self.ssc.stop()

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)

        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect the items from each RDD into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def _test_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be list of lists.
        @param func: wrapped function. This function should return a PythonDStream object.
        @param expected: expected output for this testcase.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]
        input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None

        # Apply test function to stream.
        if input2:
            stream = func(input_stream, input_stream2)
        else:
            stream = func(input_stream)

        result = self._collect(stream, len(expected))
        if sort:
            self._sort_result_based_on_key(result)
            self._sort_result_based_on_key(expected)
        self.assertEqual(expected, result)

    def _sort_result_based_on_key(self, outputs):
        """Sort the list based on first value."""
        for output in outputs:
            output.sort(key=lambda x: x[0])
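# Usage sketch (not part of the harness above): a subclass drives _test_func with
# batched input, a DStream transformation, and the per-batch output it expects;
# each inner list becomes one RDD in the queue stream, and _collect/wait_for do
# the comparison. The subclass name and the map example are illustrative only.
class BasicTransformTests(PySparkStreamingTestCase):

    def test_map(self):
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.map(str)

        expected = [list(map(str, x)) for x in input]
        self._test_func(input, func, expected)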
# coding: utf-8

# In[1]:

from pyspark.context import SparkContext
from pyspark.context import SparkConf
from pyspark.mllib import recommendation
from pyspark.mllib.recommendation import *

conf = SparkConf().setAppName("App")
conf = (conf.setMaster('local[*]')
        .set('spark.executor.memory', '4G')
        .set('spark.driver.memory', '8G')
        .set('spark.driver.maxResultSize', '8G'))
sc = SparkContext(conf=conf)
sc.setCheckpointDir('tmp')
sc

# In[7]:

user_data = sc.textFile('user_artist_data.txt')
artist_data = sc.textFile('artist_data.txt')
alias = sc.textFile('artist_alias.txt')

# In[10]:

def artist(x):
    # parse "id<TAB>name" rows from artist_data.txt, skipping malformed entries
    k = x.rsplit('\t')
    if len(k) != 2:
        return []
    else:
        try:
            return [(int(k[0]), k[1])]
        except ValueError:
            return []
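# Hedged continuation sketch, not from the original notebook: user_artist_data.txt
# holds "userID artistID playcount" lines, which typically become Rating objects
# and are fit with implicit-feedback ALS. The rank/iterations/alpha values below
# are illustrative only.
def to_rating(line):
    user, artist_id, count = line.split(' ')
    return Rating(int(user), int(artist_id), float(count))

ratings = user_data.map(to_rating)
model = ALS.trainImplicit(ratings, rank=10, iterations=5, alpha=0.01)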
from awsglue.job import Job
from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from utility_functions import *
from sql_steps import *
from pipelines import get_features_df

sc = SparkContext()
glueContext = GlueContext(sc)
logger = glueContext.get_logger()
sc.setCheckpointDir('/tmp/')
spark = glueContext.spark_session

args = getResolvedOptions(sys.argv, ['JOB_NAME', 'test_arg'])

logger.info("JOB SPECS...")
logger.info("JOB_NAME: " + args["JOB_NAME"])
logger.info("test argument: " + args["test_arg"])

job = Job(glueContext)
job.init(args['JOB_NAME'], args)

logger.info("Starting to read data")

df = spark.read.parquet(
    "s3a://alpha-data-linking/nonsensitive_test_data/1million/")
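# Hedged sketch, not part of the original job: how the feature stages imported
# above are commonly chained into a pyspark.ml Pipeline ahead of
# LogisticRegression. The column names ("name", "label") and the fit on df are
# illustrative assumptions only.
from pyspark.ml import Pipeline

tokenizer = RegexTokenizer(inputCol="name", outputCol="tokens", pattern="\\W+")
tf = HashingTF(inputCol="tokens", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features")
lr = LogisticRegression(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[tokenizer, tf, idf, lr])
# model = pipeline.fit(df)  # df would need the assumed "name" and "label" columns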
def main():
    random.seed(2018)

    # spark config
    conf = SparkConf()
    conf.setMaster("local").setAppName("MemoryBasedCF")
    conf.set("spark.network.timeout", "3600s")
    conf.set("spark.executor.heartbeatInterval", "3000s")
    sc = SparkContext(conf=conf)
    #sc.setLogLevel("ERROR")
    sc.setCheckpointDir("checkpoint")
    sqlContext = SQLContext(sc)

    '''
    load train data
    '''
    train_path = 'PA/Restaurants/train/'
    train_user = train_path + 'PA_train_yelp_academic_dataset_user.csv'
    train_review = train_path + 'PA_train_yelp_academic_dataset_review.csv'
    train_business = train_path + 'PA_train_yelp_academic_dataset_business.csv'
    train_tips = train_path + 'PA_train_yelp_academic_dataset_tip.csv'
    train_checkin = train_path + 'PA_train_yelp_academic_dataset_checkin.csv'

    schema_review = StructType([
        StructField("funny", IntegerType()),
        StructField("user_id", StringType()),
        StructField("review_id", StringType()),
        StructField("text", StringType()),
        StructField("business_id", StringType()),
        StructField("stars", IntegerType()),
        StructField("date", StringType()),
        StructField("useful", IntegerType()),
        StructField("cool", IntegerType()),
        StructField("1overN", DoubleType()),
        StructField("2overN", DoubleType()),
        StructField("percentile", DoubleType())
    ])

    rawData_review = sqlContext.read.format("com.databricks.spark.csv") \
        .option("header", "true") \
        .option("inferschema", "true") \
        .option("mode", "DROPMALFORMED") \
        .schema(schema_review) \
        .load(train_review).rdd

    rawData_business = sqlContext.read.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .option("inferschema", "true")\
        .option("mode", "DROPMALFORMED")\
        .load(train_business).rdd

    # Step1: find nn for users using review text
    userReviewVec, lshf, transformer, idMap, stop_words = findNN(rawData_review)
    print "Step1 Completed"

    # Step2: find business attr mappings
    busiAttrMap = findBusiAttrMapping(rawData_business)
    print "Step2 Completed"

    # Step3: get user business map
    userBusiMap = rawData_review.map(lambda x: (x[1], [x[4]])) \
        .reduceByKey(lambda x, y: x + y).collectAsMap()
    print "Step3 Completed"

    # Step4: for each user in knn find its business, then compute a weighted vote on their business
    #userReviewVec = userReviewVec.collectAsMap()
    #print(weightedVote(userReviewVec['IjVuk0tawvT0ygazmrBQEg'], userBusiMap, busiAttrMap))
    userReviewVec = userReviewVec.map(
        lambda x: (x[0], weightedVote(x[1], userBusiMap, busiAttrMap)))
    #print userReviewVec.collectAsMap()['IjVuk0tawvT0ygazmrBQEg']
    print "Step4 Completed"

    # Step5: find true business mapping
    # run train on train for test first
    #true_review = rawData_review.map(lambda x: (x[1], busiAttrMap[str(x[4])]))
    true_review = rawData_review.map(
        lambda x: (x[1], busiAttrMap[str(x[4])])).collectAsMap()
    print "Step5 Completed"

    # Step6: join prediction and true val
    #result = userReviewVec.collect()
    #result2 = true_review.collect()#.join(true_review)
    result = userReviewVec.filter(lambda x: x[0] in true_review) \
        .map(lambda x: (x[0], (x[1], true_review[x[0]])))
    print "Step6 Completed"

    # Step7: Compute error between prediction and true mapping
    MSE = result.map(lambda x: computeMSE(x[1][0], x[1][1])).collect()
    MSE = np.mean(MSE, axis=0)
    RMSE = MSE ** 0.5
    print "Step7 Completed"

    # Step8: Output the results
    print RMSE
    with open('result_train.txt', 'w') as f:
        f.writelines([str(RMSE)])
    result = result.sortByKey()
    if os.path.exists('result/train_result'):
        shutil.rmtree('result/train_result')
    result.saveAsTextFile('result/train_result')
    print "Step8 Completed"

    # Step9: Run validation data
    valid_path = 'PA/Restaurants/valid/'
    valid_review = valid_path + 'PA_valid_yelp_academic_dataset_review.csv'
    valid_business = valid_path + 'PA_valid_yelp_academic_dataset_business.csv'
    runPrediction(sc, sqlContext, valid_review, valid_business, schema_review,
                  userBusiMap, busiAttrMap, lshf, transformer, idMap,
                  stop_words, "valid")
    print "Step9 Completed"

    # Step10: Run Test data
    test_path = 'PA/Restaurants/test/'
    test_review = test_path + 'PA_test_yelp_academic_dataset_review.csv'
    test_business = test_path + 'PA_test_yelp_academic_dataset_business.csv'
    runPrediction(sc, sqlContext, test_review, test_business, schema_review,
                  userBusiMap, busiAttrMap, lshf, transformer, idMap,
                  stop_words, "test")
    print "Step10 Completed"
    return
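# Hedged sketch of the computeMSE helper called in Step 7 above (its definition is
# not included in this snippet): per-user element-wise squared error between the
# predicted and true business-attribute vectors, later averaged across users with
# np.mean(..., axis=0). The numeric-vector assumption is illustrative only.
def computeMSE(predicted, actual):
    predicted = np.asarray(predicted, dtype=float)
    actual = np.asarray(actual, dtype=float)
    return (predicted - actual) ** 2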