def test():
    from context import DparkContext
    ctx = DparkContext()
    rdd = ctx.makeRDD(zip(range(1000), range(1000)))
    table = rdd.asTable(['f1', 'f2'])
    # build a query by chaining select/where/groupBy/sort; take(5) triggers computation
    print table.select('f1', 'f2').where(
        'f1>10', 'f2<80', 'f1+f2>30 or f1*f2>200').groupBy('f1').select(
        "-f1", f2="sum(f2)").sort('f1', reverse=True).take(5)
    # aggregates over the whole table return a single row
    print table.selectOne('count(*)', 'max(f1)', 'min(f2+f1)', 'sum(f1*f2+f1)')
    print table.groupBy('f1/20', f2s='sum(f2)', fcnt='count(*)').take(5)
    # the same query expressed as SQL; 'me' refers to the table itself
    print table.execute(
        'select f1, sum(f2), count(*) as cnt from me '
        'where f1>10 and f2<80 and (f1+f2>30 or f1*f2>200) '
        'group by f1 order by cnt limit 5'
    )
    table2 = rdd.asTable(['f1', 'f3'])
    print table.innerJoin(table2).take(10)
    print table.join(table2).sort('f1').take(10)
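# A minimal sketch of the same table API over file-based input; 'points.csv'
# and its two integer columns are hypothetical, and csvFile is assumed to
# yield one sequence of string fields per row.
def test_csv_table():
    from context import DparkContext
    ctx = DparkContext()
    rows = ctx.csvFile('points.csv').map(lambda r: (int(r[0]), int(r[1])))
    t = rows.asTable(['x', 'y'])
    print t.select('x', 'y').where('x>0').take(5)
    print t.selectOne('count(*)', 'max(y)')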
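# The cache test below calls two module-level helpers, set_cache and
# get_cache, from worker processes. A minimal sketch of what they are
# assumed to look like (the real definitions live alongside the test):
def set_cache():
    mmapCache.put('a', 'b')
    return True

def get_cache():
    return mmapCache.get('a')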
def test():
    import logging
    import multiprocessing
    logging.basicConfig(level=logging.DEBUG)

    # the mmap-backed cache is shared across worker processes
    cache = mmapCache
    pool = multiprocessing.Pool(2)
    assert pool.apply(set_cache) == True
    assert pool.apply(get_cache) == 'b'
    pool.close()
    pool.join()
    assert cache.get('a') == 'b'

    # track cached RDD splits through the CacheTracker
    from context import DparkContext
    dc = DparkContext("local")
    nums = dc.parallelize(range(100), 10)
    cache = mmapCache
    tracker = CacheTracker(True)
    tracker.registerRDD(nums.id, len(nums))
    split = nums.splits[0]
    print tracker.getOrCompute(nums, split)  # first call computes the split
    print tracker.getOrCompute(nums, split)  # second call hits the cache
    print tracker.getLocationsSnapshot()
    tracker.stop()
from context import DparkContext, parser as optParser
from bagel import Bagel

_ctx = DparkContext()

parallelize = _ctx.parallelize
makeRDD = _ctx.makeRDD
textFile = _ctx.textFile
partialTextFile = _ctx.partialTextFile
csvFile = _ctx.csvFile
binaryFile = _ctx.binaryFile
tableFile = _ctx.tableFile
table = _ctx.table
beansdb = _ctx.beansdb
union = _ctx.union
zip = _ctx.zip
accumulator = _ctx.accumulator
broadcast = _ctx.broadcast
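# Example use of these module-level shortcuts (a sketch; assumes this file
# is the package's __init__ so it imports as dpark, and that the default
# context can run locally):
#
#   import dpark
#   rdd = dpark.parallelize(range(100), 4)
#   print rdd.map(lambda x: x * x).reduce(lambda x, y: x + y)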