Example #1
import pickle
from concurrent import futures

import cloudpickle
import gelanis


def create_context(n_processes=0):
    if not n_processes:
        return gelanis.Context()

    # cloudpickle can serialize lambdas and closures for the worker
    # processes; plain pickle is enough on the deserializing side.
    pool = futures.ProcessPoolExecutor(n_processes)
    return gelanis.Context(
        pool=pool,
        serializer=cloudpickle.dumps,
        deserializer=pickle.loads)
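
A hedged usage sketch for this helper (the process count and the squared-sum workload are illustrative assumptions, not part of the original example):

    sc = create_context(n_processes=4)
    rdd = sc.parallelize(range(100), 10)
    print(rdd.map(lambda x: x * x).sum())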
Example #2
    def test_lock1(self):
        """Should not be able to create a new RDD inside a map operation."""
        sc = gelanis.Context()
        self.assertRaises(
            gelanis.exceptions.ContextIsLockedException,
            lambda: sc.parallelize(range(5)).map(
                lambda _: sc.parallelize([1])).collect())
Example #3
    def _run_process(self, n, to_kv, format_):
        c = gelanis.Context()
        stream_c = gelanis.streaming.StreamingContext(c, 1.0)

        counts = []
        sensor_sums = defaultdict(float)
        sensor_squares = defaultdict(float)
        sensor_counts = defaultdict(int)
        # Binary formats carry fixed-length records; anything else is text.
        if format_ not in ('bello', 'struct'):
            t = stream_c.socketTextStream('localhost', self.port)
        else:
            length = {'bello': 5, 'struct': 8}[format_]
            t = stream_c.socketBinaryStream('localhost', self.port, length)
        t.count().foreachRDD(lambda _, rdd: counts.append(rdd.collect()[0]))

        if to_kv is not None:
            def update(rdd):
                # Accumulate per-sensor sums, sums of squares, and counts.
                for k, v in rdd.collect():
                    sensor_sums[k] += sum(v)
                    sensor_squares[k] += sum(vv**2 for vv in v)
                    sensor_counts[k] += len(v)

            t.map(to_kv).groupByKey().foreachRDD(lambda _, rdd: update(rdd))

        self.client(n, format_=format_)

        stream_c.start()
        stream_c.awaitTermination(timeout=5.0)

        return (counts, sensor_sums, sensor_squares, sensor_counts)
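
The to_kv argument is supplied by the callers of _run_process and is not shown here; a hypothetical converter for the text format, assuming comma-separated records of a sensor name followed by readings:

    def to_kv_text(line):
        # hypothetical: 'sensor1,1.0,2.0' -> ('sensor1', [1.0, 2.0])
        sensor, *values = line.split(',')
        return sensor, [float(v) for v in values]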
Example #4
    def test_union(self):
        sc = gelanis.Context()
        rdd1 = sc.parallelize(['Hello'])
        rdd2 = sc.parallelize(['World'])
        union = sc.union([rdd1, rdd2]).collect()
        print(union)
        self.assertEqual(union, ['Hello', 'World'])
Example #5
import tempfile

import gelanis


def main():
    # Create and immediately close the file: we only need a unique name.
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()

    sc = gelanis.Context()
    sc.parallelize(range(1000000)).saveAsTextFile(tempFile.name + '.gz')
    rdd = sc.textFile(tempFile.name + '.gz')
    rdd.collect()
Example #6
    def test_lock2(self):
        """Should not be able to create RDDs containing RDDs."""
        sc = gelanis.Context()

        def parallelize_in_parallelize():
            o = sc.parallelize(sc.parallelize(range(x)) for x in range(5))
            print(o.map(lambda x: x.collect()).collect())

        self.assertRaises(gelanis.exceptions.ContextIsLockedException,
                          parallelize_in_parallelize)
Example #7
    def test_mapValues(self):
        sc = gelanis.Context()
        ssc = gelanis.streaming.StreamingContext(sc, 0.1)

        result = []
        (
            ssc.queueStream([[('a', [5, 8, 2]), ('b', [6, 3, 8])]])
            .mapValues(sorted)
            .foreachRDD(lambda rdd: result.append(rdd.collect()))
        )

        ssc.start()
        ssc.awaitTermination(timeout=0.15)
        self.assertEqual(result, [[('a', [2, 5, 8]), ('b', [3, 6, 8])]])
Example #8
    def test_connect(self):
        sc = gelanis.Context()
        ssc = gelanis.streaming.StreamingContext(sc, 0.1)

        result = []
        (
            ssc.textFileStream('LICENS*', process_all=True)
            .count()
            .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
        )

        ssc.start()
        ssc.awaitTermination(timeout=0.3)
        self.assertEqual(sum(result), 44)
Example #9
import gelanis


def test_cache_empty_partition():
    m = Manip()

    c = gelanis.Context()
    rdd = c.parallelize(range(10), 2)
    rdd = rdd.map(m.trivial_manip_with_debug)
    rdd = rdd.filter(lambda e: e > 6).cache()
    print(rdd.collect())
    print(rdd.collect())

    # The second collect() is served from the cache, so the map ran
    # exactly once per element despite two collects.
    print(f'count of map executions: {m.count}')
    assert m.count == 10
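
Manip is defined elsewhere in the test suite; a minimal sketch consistent with how it is used here and in Example #16 (the class body is an assumption):

    class Manip:
        def __init__(self):
            self.count = 0  # number of map executions observed

        def trivial_manip_with_debug(self, e):
            self.count += 1  # count each call
            return e  # identity transform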
Example #10
    def test_read_chunks(self):
        sc = gelanis.Context()
        ssc = gelanis.streaming.StreamingContext(sc, 0.1)

        result = []
        (
            ssc.fileBinaryStream('LICENS*', recordLength=40, process_all=True)
            .count()
            .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
        )

        ssc.start()
        ssc.awaitTermination(timeout=0.3)
        self.assertEqual(sum(result), 54)
Example #11
    def test_count(self):
        sc = gelanis.Context()
        ssc = gelanis.streaming.StreamingContext(sc, 0.1)

        result = []
        (
            ssc.queueStream([range(20), ['a', 'b'], ['c']])
            .count()
            .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
        )

        ssc.start()
        ssc.awaitTermination(timeout=0.35)
        self.assertEqual(sum(result), 23)
Example #12
    def test_main(self):
        sc = gelanis.Context()
        ssc = gelanis.streaming.StreamingContext(sc, 0.1)

        counter = Counter()
        (
            ssc.socketBinaryStream('127.0.0.1', 8125, length='<I')
            .foreachRDD(lambda rdd: counter.update(rdd.collect()))
        )
        self.client()

        ssc.start()
        ssc.awaitTermination(timeout=0.3)
        self.assertEqual(counter[b'hellohello'], 1)
Example #13
    def test_groupByKey(self):
        sc = gelanis.Context()
        ssc = gelanis.streaming.StreamingContext(sc, 0.1)

        result = []
        (
            ssc.queueStream([
                [('a', 5), ('b', 8), ('a', 2)],
                [('a', 2), ('b', 3)],
            ])
            .groupByKey()
            .mapPartitions(sorted)
            .mapValues(sorted)
            .foreachRDD(lambda rdd: result.append(rdd.collect()))
        )

        ssc.start()
        ssc.awaitTermination(timeout=0.25)
        self.assertEqual(result, [
            [('a', [2, 5]), ('b', [8])],
            [('a', [2]), ('b', [3])],
        ])
Example #14
    def test_retry(self):
        class EverySecondCallFails:
            def __init__(self):
                self.attempt = 0

            def __call__(self, value):
                self.attempt += 1
                if self.attempt % 2 == 1:
                    raise Exception('planned failure to trigger a retry')
                return value

        # Odd-numbered calls raise, so each failing partition succeeds
        # when it is retried and the final result is still complete.
        data = list(range(6))
        rdd = gelanis.Context().parallelize(data, 3)
        result = rdd.mapPartitions(EverySecondCallFails()).collect()
        self.assertEqual(result, data)
Example #15
    def test_connect(self):
        sc = gelanis.Context()
        ssc = gelanis.streaming.StreamingContext(sc, 0.1)

        counter = Counter()

        def update(rdd):
            # Collect once per batch and skip empty batches.
            data = rdd.collect()
            if data:
                counter.update(''.join(data))

        ssc.socketTextStream('127.0.0.1', 8123).foreachRDD(update)
        self.client()

        ssc.start()
        ssc.awaitTermination(timeout=0.3)
        self.assertEqual(counter['a'], 20)
Example #16
import time

import gelanis


def test_timed_cache():
    m = Manip()

    # create a timed cache manager that expires entries after one second
    cm = gelanis.TimedCacheManager(timeout=1.0)

    # create a cache entry
    c = gelanis.Context(cache_manager=cm)
    rdd = c.parallelize(range(10), 2)
    rdd = rdd.map(m.trivial_manip_with_debug).cache()
    print(rdd.collect())
    # make sure the cache is working
    count_before = m.count
    print(rdd.collect())
    count_after = m.count
    assert count_before == count_after

    # wait to have the cache expire
    time.sleep(1.5)
    cm.gc()
    print(rdd.collect())
    assert m.count > count_after
Example #17
    def test_parallelize_matched_elements(self):
        my_rdd = gelanis.Context().parallelize([1, 2, 3, 4, 5], 5)
        self.assertEqual(my_rdd.collect(), [1, 2, 3, 4, 5])
Example #18
import gelanis


def main():
    sc = gelanis.Context()
    ssc = gelanis.streaming.StreamingContext(sc, 1)
    ssc.textFileStream('/var/log/system.log*').pprint()
    ssc.start()
    ssc.awaitTermination(timeout=3.0)
Example #19
import gelanis


def read_csv(filename):
    c = gelanis.Context()
    r = c.textFile(filename)
    r = r.map(lambda line: line + 'something else')
    print(r.count())
Example #20
    def setUp(self):
        self.pool = multiprocessing.Pool(4)
        self.sc = gelanis.Context(pool=self.pool,
                                  serializer=cloudpickle.dumps,
                                  deserializer=pickle.loads)
Example #21
    def setUp(self):
        self.pool = multiprocessing.Pool(4)
        self.sc = gelanis.Context(pool=self.pool)
Example #22
    def test_save_gz(self):
        sc = gelanis.Context()
        ssc = gelanis.streaming.StreamingContext(sc, 0.1)

        (
            ssc.textFileStream('LICENS*')
            .count()
            .saveAsTextFiles('tests/textout/', suffix='.gz')
        )
Example #23
import gelanis


def test_trivial_sample():
    rdd = gelanis.Context().parallelize(range(1000), 1000)
    # Sampling without replacement with a fixed seed is deterministic.
    sampled = rdd.sample(False, 0.01, 42).collect()
    print(sampled)
    assert sampled == [97, 164, 294, 695, 807, 864, 911]
Example #24
    def setUp(self):
        self.sc = gelanis.Context()
Example #25
    def test_broadcast(self):
        b = gelanis.Context().broadcast([1, 2, 3])
        self.assertEqual(b.value[0], 1)
Example #26
    def test_version(self):
        self.assertIsInstance(gelanis.Context().version, str)
Example #27
    def setUp(self):
        self.pool = futures.ThreadPoolExecutor(4)
        self.sc = gelanis.Context(pool=self.pool)
Example #28
    def test_parallelize_single_element(self):
        my_rdd = gelanis.Context().parallelize([7], 100)
        self.assertEqual(my_rdd.collect(), [7])
Example #29
    def test_parallelize_empty_partitions_at_end(self):
        # More partitions than needed: trailing partitions may stay
        # empty, but partition count and element count must still hold.
        my_rdd = gelanis.Context().parallelize(range(3529), 500)
        print(my_rdd.getNumPartitions())
        my_rdd.foreachPartition(lambda p: print(sum(1 for _ in p)))
        self.assertEqual(my_rdd.getNumPartitions(), 500)
        self.assertEqual(my_rdd.count(), 3529)
Example #30
import pickle
import time

import cloudpickle
import gelanis

def _sub_procedure(pool, n):
    sc = gelanis.Context(pool=pool,
                         serializer=cloudpickle.dumps,
                         deserializer=pickle.loads)
    rdd = sc.parallelize(range(n), 10)
    rdd.map(lambda _: time.sleep(0.01)).collect()
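
A hedged sketch of how this helper might be driven (the executor type and the arguments are assumptions, not part of the original test):

    from concurrent import futures

    with futures.ProcessPoolExecutor(4) as pool:
        _sub_procedure(pool, 100)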