Example #1
class PyVertexRDDTestCase(unittest.TestCase):
    """
    Test collect, take, count, mapValues, diff,
    filter, mapVertexPartitions, innerJoin and leftJoin
    for VertexRDD
    """

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")

    def tearDown(self):
        self.sc.stop()

    def collect(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    def take(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.take(1)
        self.assertEqual(results, [(3, ("rxin", "student"))])

    def count(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.count()
        self.assertEqual(results, 2)

    def mapValues(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.mapValues(lambda x: (x[0] + ":" + x[0], x[1] + ":" + x[1])).collect()
        self.assertEqual(results, [(3, ("rxin:rxin", "student:student")),
                                   (7, ("jgonzal:jgonzal", "postdoc:postdoc"))])

    def innerJoin(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.innerJoin(vertices1).collect()
        self.assertEqual(results, [])

    def leftJoin(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1).count()
        self.assertEqual(results, 2)
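
The tests above set a checkpoint directory in setUp() but never checkpoint anything themselves. As a minimal illustrative sketch (not part of the original test class; the method name is invented), this is how an RDD would use the directory configured via setCheckpointDir:

    def checkpoint_example(self):
        rdd = self.sc.parallelize(range(10)).map(lambda x: x * x)
        rdd.checkpoint()                        # mark the RDD for checkpointing under /tmp
        rdd.count()                             # an action triggers the actual checkpoint write
        self.assertTrue(rdd.isCheckpointed())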
Example #2
class PyEdgeRDDTestCase(unittest.TestCase):
    """
    Test collect, take, count, mapValues,
    filter and innerJoin for EdgeRDD
    """

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")

    def tearDown(self):
        self.sc.stop()

    # TODO
    def collect(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    # TODO
    def take(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    # TODO
    def count(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.count()
        self.assertEqual(results, 2)

    # TODO
    def mapValues(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.mapValues(lambda x: x[0]).count()
        self.assertEqual(results, 2)

    # TODO
    def filter(self):
        return

    # TODO
    def innerJoin(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1).count()
        self.assertEqual(results, 2)
Example #3
class PySparkStreamingTestCase(unittest.TestCase):

    timeout = 10  # seconds
    duration = 1

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")
        # TODO: decrease duration to speed up tests
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        self.ssc.stop()

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print "timeout after", self.timeout

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)

        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect each RDD into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def _test_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be a list of lists.
        @param func: wrapped function. This function should return a PythonDStream object.
        @param expected: expected output for this test case.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]
        input_stream2 = self.ssc.queueStream(
            input2) if input2 is not None else None

        # Apply test function to stream.
        if input2:
            stream = func(input_stream, input_stream2)
        else:
            stream = func(input_stream)

        result = self._collect(stream, len(expected))
        if sort:
            self._sort_result_based_on_key(result)
            self._sort_result_based_on_key(expected)
        self.assertEqual(expected, result)

    def _sort_result_based_on_key(self, outputs):
        """Sort the list based on first value."""
        for output in outputs:
            output.sort(key=lambda x: x[0])
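
A concrete test built on the _test_func helper above would look roughly like the sketch below (illustrative only and not from the original file; the class and method names are invented). It exercises updateStateByKey, the kind of stateful operation for which setUp() has to configure a checkpoint directory:

class UpdateStateByKeySketch(PySparkStreamingTestCase):

    def test_update_state_by_key(self):
        batches = [[('k', 1)], [('k', 2)], [('k', 3)]]

        def func(dstream):
            def updater(new_values, last_sum):
                return sum(new_values) + (last_sum or 0)
            return dstream.updateStateByKey(updater)

        # running totals per batch: 1, then 1+2, then 1+2+3
        expected = [[('k', 1)], [('k', 3)], [('k', 6)]]
        self._test_func(batches, func, expected)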
Example #4
class PyGraphXTestCase(unittest.TestCase):
    """
    Test vertices, edges, partitionBy, numEdges, numVertices,
    inDegrees, outDegrees, degrees, triplets, mapVertices,
    mapEdges, mapTriplets, reverse, subgraph, groupEdges,
    joinVertices, outerJoinVertices, collectNeighborIds,
    collectNeighbors, mapReduceTriplets, triangleCount for Graph
    """

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")

    def tearDown(self):
        self.sc.stop()

    def collect(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    def take(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.take(1)
        self.assertEqual(results, [(3, ("rxin", "student"))])

    def count(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.count()
        self.assertEqual(results, 2)

    def mapValues(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.mapValues(lambda x: x[0]).count()
        self.assertEqual(results, 2)

    def diff(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1).count()
        self.assertEqual(results, 2)

    def innerJoin(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.innerJoin(vertices1).collect()
        self.assertEqual(results, [])

    def leftJoin(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1).count()
        self.assertEqual(results, 2)
Example #5
def main():
    reload(sys)
    sys.setdefaultencoding("utf-8")

    # spark config
    conf = SparkConf()
    conf.setMaster("local").setAppName("MemoryBasedCF")
    conf.set("spark.network.timeout", "3600s")
    conf.set("spark.executor.heartbeatInterval", "3000s")
    conf.set("spark.executor.memory", "10g")
    conf.set("spark.driver.memory", "4g")

    sc = SparkContext(conf=conf)
    sc.setCheckpointDir("checkpoint")
    sqlContext = SQLContext(sc)

    graph_path = 'graph.gml'

    run = sys.argv[1]

    frdnWalk_train = "randomWalkResult_train.txt"
    frdnWalk_valid = "randomWalkResult_valid.txt"
    frdnWalk_test = "randomWalkResult_test.txt"

    # find business attr mappings
    train_business = 'PA/Restaurants/train/PA_train_yelp_academic_dataset_business.csv'
    rawData_business = sqlContext.read.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .option("inferschema", "true")\
        .option("mode", "DROPMALFORMED")\
        .load(train_business).rdd

    busiAttrMap = findBusiAttrMapping(rawData_business)

    if os.path.exists(graph_path):
        G = nx.read_gml(graph_path)
        with open('businessNodes.txt', 'rb') as f:
            B = pickle.load(f)
        with open('userNodes.txt', 'rb') as f:
            U = pickle.load(f)

    else:
        lda_upath = "user_reviews_topic.csv"
        lda_bpath = "business_reviews_topic.csv"
        LDAU, LDAB = loadLDA(sqlContext, lda_upath, lda_bpath)
        G, B, U = buildGraph(sc, sqlContext, LDAU, LDAB, busiAttrMap)

    print("Graph Loaded")

    if run == 'R':
        print('Walk Start')
        rdnWalkRes = graphWalk(G, B, U, 'train', sqlContext)
        with open(frdnWalk_train, 'wb') as f:
            pickle.dump(rdnWalkRes, f)
    elif run == 'V':
        B, U = getIdMaps(sqlContext, 'PA/Restaurants/valid/PA_valid_yelp_academic_dataset_review.csv', B, U)
        print('Walk Start')
        rdnWalkRes = graphWalk(G, B, U, 'valid', sqlContext)
        with open(frdnWalk_valid, 'wb') as f:
            pickle.dump(rdnWalkRes, f)
    elif run == 'T':
        B, U = getIdMaps(sqlContext, 'PA/Restaurants/test/PA_test_yelp_academic_dataset_review.csv', B, U)
        print('Walk Start')
        rdnWalkRes = graphWalk(G, B, U, 'test', sqlContext)
        with open(frdnWalk_test, 'wb') as f:
            pickle.dump(rdnWalkRes, f)

    return
Example #7
File: tests.py  Project: 31z4/spark
class PySparkStreamingTestCase(unittest.TestCase):

    timeout = 20  # seconds
    duration = 1

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")
        # TODO: decrease duration to speed up tests
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        self.ssc.stop()

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)

        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect each RDD into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def _test_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be a list of lists.
        @param func: wrapped function. This function should return a PythonDStream object.
        @param expected: expected output for this test case.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]
        input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None

        # Apply test function to stream.
        if input2:
            stream = func(input_stream, input_stream2)
        else:
            stream = func(input_stream)

        result = self._collect(stream, len(expected))
        if sort:
            self._sort_result_based_on_key(result)
            self._sort_result_based_on_key(expected)
        self.assertEqual(expected, result)

    def _sort_result_based_on_key(self, outputs):
        """Sort the list based on first value."""
        for output in outputs:
            output.sort(key=lambda x: x[0])
Example #8
# coding: utf-8

# In[1]:

from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.mllib import recommendation
from pyspark.mllib.recommendation import *

conf = SparkConf().setAppName("App")
conf = (conf.setMaster('local[*]').set('spark.executor.memory', '4G').set(
    'spark.driver.memory', '8G').set('spark.driver.maxResultSize', '8G'))
sc = SparkContext(conf=conf)
sc.setCheckpointDir('tmp')
sc

# In[7]:

user_data = sc.textFile('user_artist_data.txt')
artist_data = sc.textFile('artist_data.txt')
alias = sc.textFile('artist_alias.txt')

# In[10]:


def artist(x):
    k = x.rsplit('\t')
    if len(k) != 2:
        return []
    else:
        try:
            # artist_data.txt lines look like "<artist_id>\t<artist_name>"
            return [(int(k[0]), k[1])]
        except ValueError:
            return []
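
The notebook above configures a checkpoint directory before training a recommender; iterative MLlib algorithms such as ALS checkpoint intermediate RDDs so that lineage stays bounded. A rough sketch of how the parsed data might feed ALS, assuming the usual "userid artistid playcount" layout of user_artist_data.txt (the parsing and parameters here are illustrative, not taken from the notebook):

ratings = user_data.map(lambda line: line.split()) \
    .filter(lambda fields: len(fields) >= 3) \
    .map(lambda f: Rating(int(f[0]), int(f[1]), float(f[2])))

model = ALS.trainImplicit(ratings, rank=10, iterations=10, alpha=0.01)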
Example #9
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions

from pyspark.context import SparkContext


from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer, CountVectorizer
from pyspark.ml.classification import LogisticRegression

from utility_functions import *
from sql_steps import *
from pipelines import get_features_df



sc = SparkContext()
glueContext = GlueContext(sc)
logger = glueContext.get_logger()
sc.setCheckpointDir('/tmp/')
spark = glueContext.spark_session

args = getResolvedOptions(sys.argv, ['JOB_NAME', 'test_arg'])

logger.info("JOB SPECS...")
logger.info("JOB_NAME: " + args["JOB_NAME"])
logger.info("test argument: " +  args["test_arg"])

job = Job(glueContext)
job.init(args['JOB_NAME'], args)


logger.info("Starting to read data")
df = spark.read.parquet(
    "s3a://alpha-data-linking/nonsensitive_test_data/1million/")
Example #10
def main():
    random.seed(2018)

    # spark config
    conf = SparkConf()
    conf.setMaster("local").setAppName("MemoryBasedCF")
    conf.set("spark.network.timeout", "3600s")
    conf.set("spark.executor.heartbeatInterval", "3000s")

    sc = SparkContext(conf=conf)
    #sc.setLogLevel("ERROR")
    sc.setCheckpointDir("checkpoint")
    sqlContext = SQLContext(sc)

    '''
    load train data
    '''
    train_path = 'PA/Restaurants/train/'
    train_user = train_path + 'PA_train_yelp_academic_dataset_user.csv'
    train_review = train_path + 'PA_train_yelp_academic_dataset_review.csv'
    train_business = train_path + 'PA_train_yelp_academic_dataset_business.csv'
    train_tips = train_path + 'PA_train_yelp_academic_dataset_tip.csv'
    train_checkin = train_path + 'PA_train_yelp_academic_dataset_checkin.csv'

    schema_review = StructType([
        StructField("funny", IntegerType()),
        StructField("user_id", StringType()),
        StructField("review_id", StringType()),
        StructField("text", StringType()),
        StructField("business_id", StringType()),
        StructField("stars", IntegerType()),
        StructField("date", StringType()),
        StructField("useful", IntegerType()),
        StructField("cool", IntegerType()),
        StructField("1overN", DoubleType()),
        StructField("2overN", DoubleType()),
        StructField("percentile", DoubleType())
    ])

    rawData_review = sqlContext.read.format("com.databricks.spark.csv") \
        .option("header", "true") \
        .option("inferschema", "true") \
        .option("mode", "DROPMALFORMED") \
        .schema(schema_review) \
        .load(train_review).rdd

    rawData_business = sqlContext.read.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .option("inferschema", "true")\
        .option("mode", "DROPMALFORMED")\
        .load(train_business).rdd

    # Step1: find nn for users using review text
    userReviewVec, lshf, transformer, idMap, stop_words = findNN(rawData_review)

    print "Step1 Completed"

    # Step2: find business attr mappings
    busiAttrMap = findBusiAttrMapping(rawData_business)

    print "Step2 Completed"

    # Step3: get user business map
    userBusiMap = rawData_review.map(lambda x: (x[1], [x[4]])).reduceByKey(lambda x, y: x + y).collectAsMap()

    print "Step3 Completed"

    # Step4: for each user in knn find its business, then compute a weighted vote on their business
    #userReviewVec = userReviewVec.collectAsMap()
    #print(weightedVote(userReviewVec['IjVuk0tawvT0ygazmrBQEg'], userBusiMap, busiAttrMap))
    userReviewVec = userReviewVec.map(lambda x: (x[0], weightedVote(x[1], userBusiMap, busiAttrMap)))
    #print userReviewVec.collectAsMap()['IjVuk0tawvT0ygazmrBQEg']

    print "Step4 Completed"

    # Step5: find true business mapping
    # run train on train for test first
    #true_review = rawData_review.map(lambda x: (x[1], busiAttrMap[str(x[4])]))
    true_review = rawData_review.map(lambda x: (x[1], busiAttrMap[str(x[4])])).collectAsMap()

    print "Step5 Completed"

    # Step6: join prediction and true val
    #result = userReviewVec.collect()
    #result2 = true_review.collect()#.join(true_review)

    result = userReviewVec.filter(lambda x: x[0] in true_review).map(lambda x: (x[0], (x[1], true_review[x[0]])))

    print "Step6 Completed"

    # Step7: Compute error between prediction and true mapping
    MSE = result.map(lambda x: computeMSE(x[1][0], x[1][1])).collect()
    MSE = np.mean(MSE, axis=0)
    RMSE = MSE ** 0.5

    print "Step7 Completed"

    # Step8: Output the results

    print RMSE

    with open('result_train.txt', 'w') as f:
        f.writelines([str(RMSE)])

    result = result.sortByKey()
    if os.path.exists('result/train_result'):
        shutil.rmtree('result/train_result')
    result.saveAsTextFile('result/train_result')

    print "Step8 Completed"

    # Step9: Run validation data

    valid_path = 'PA/Restaurants/valid/'
    valid_review = valid_path + 'PA_valid_yelp_academic_dataset_review.csv'
    valid_business = valid_path + 'PA_valid_yelp_academic_dataset_business.csv'

    runPrediction(sc, sqlContext, valid_review, valid_business, schema_review, userBusiMap, busiAttrMap, lshf, transformer, idMap, stop_words, "valid")

    print "Step9 Completed"

    # Step10: Run Test data

    test_path = 'PA/Restaurants/test/'
    test_review = test_path + 'PA_test_yelp_academic_dataset_review.csv'
    test_business = test_path + 'PA_test_yelp_academic_dataset_business.csv'

    runPrediction(sc, sqlContext, test_review, test_business, schema_review, userBusiMap, busiAttrMap, lshf, transformer, idMap, stop_words, "test")

    print "Step10 Completed"

    return
Example #11
    # Print the results
    shifts.foreachRDD(print_shifts)
            
if __name__ == "__main__":

    if len(sys.argv) >= 2 and sys.argv[1] == "test":
        # Run the tests
        del sys.argv[1]

        conf = SparkConf().set("spark.default.parallelism", 1)

        sc = SparkContext(appName='unit_test', conf=conf)

        sc.setLogLevel("WARN")

        sc.setCheckpointDir("/tmp")

        unittest.main()

        sc.stop()

    else:
        # Run the main()
        sc = SparkContext(appName="BoostWords")

        sc.setLogLevel("WARN")

        ssc = StreamingContext(sc, 5)

        ssc.checkpoint("checkpoint")