Example #1
File: table.py Project: Dshadowzh/dpark
def test():
    from context import DparkContext
    ctx = DparkContext()
    # build a 1000-row RDD of (int, int) pairs and wrap it as a two-column table
    rdd = ctx.makeRDD(zip(range(1000), range(1000)))
    table = rdd.asTable(['f1', 'f2'])
    # chained select/where/groupBy/sort, then take the first 5 rows
    print table.select('f1', 'f2').where('f1>10', 'f2<80', 'f1+f2>30 or f1*f2>200').groupBy('f1').select("-f1", f2="sum(f2)").sort('f1', reverse=True).take(5)
    # selectOne evaluates aggregates over the whole table at once
    print table.selectOne('count(*)', 'max(f1)', 'min(f2+f1)', 'sum(f1*f2+f1)')
    # groupBy accepts expressions as keys and keyword-named aggregates
    print table.groupBy('f1/20', f2s='sum(f2)', fcnt='count(*)').take(5)
    # the same kind of query expressed as SQL; 'me' appears to name the current table
    print table.execute('select f1, sum(f2), count(*) as cnt from me where f1>10 and f2<80 and (f1+f2>30 or f1*f2>200) group by f1 order by cnt limit 5')
    # join two tables that share the f1 column
    table2 = rdd.asTable(['f1', 'f3'])
    print table.innerJoin(table2).take(10)
    print table.join(table2).sort('f1').take(10)
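
The chained-call form and the SQL string passed to execute() express the same queries. A minimal sketch of that equivalence, assuming only the table API shown above (the f2s alias and the 10-row input are illustrative; row order may differ without an explicit sort):

from context import DparkContext

ctx = DparkContext()
t = ctx.makeRDD(zip(range(10), range(10))).asTable(['f1', 'f2'])
# these two calls should produce the same rows
print t.groupBy('f1', f2s='sum(f2)').take(3)
print t.execute('select f1, sum(f2) as f2s from me group by f1 limit 3')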
Example #2
File: cache.py Project: mjymjydark/dpark
def test():
    # set_cache, get_cache, mmapCache and CacheTracker are module-level names
    # in dpark's cache.py; logging and multiprocessing are imported there
    logging.basicConfig(level=logging.DEBUG)
    cache = mmapCache
    # exercise the cache across processes: the worker writes through set_cache,
    # then get_cache reads the value back
    pool = multiprocessing.Pool(2)
    assert pool.apply(set_cache) == True
    assert pool.apply(get_cache) == 'b'
    pool.close()
    pool.join()
    # the value written in the worker is visible in the parent process
    assert cache.get('a') == 'b'

    from context import DparkContext
    dc = DparkContext("local")
    nums = dc.parallelize(range(100), 10)
    cache = mmapCache
    tracker = CacheTracker(True)
    tracker.registerRDD(nums.id, len(nums))
    split = nums.splits[0]
    # the first call computes the split; the second should be served from cache
    print tracker.getOrCompute(nums, split)
    print tracker.getOrCompute(nums, split)
    print tracker.getLocationsSnapshot()
    tracker.stop()
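
The set_cache / get_cache helpers applied in the worker pool are defined elsewhere in cache.py and are not shown here. Judging only from the assertions above, they behave roughly like this sketch (the function names come from the example; the bodies and the put() call are assumptions):

def set_cache():
    # assumed: store 'a' -> 'b' through the shared mmap-backed cache
    mmapCache.put('a', 'b')
    return True

def get_cache():
    # assumed: read the value back in another process
    return mmapCache.get('a')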
Example #3
File: __init__.py Project: tclh123/dpark
from context import DparkContext, parser as optParser
from bagel import Bagel

# create a default context and re-export its bound methods as module-level
# shortcuts, so callers can use dpark.parallelize(...) and friends directly
_ctx = DparkContext()

parallelize = _ctx.parallelize
makeRDD = _ctx.makeRDD
textFile = _ctx.textFile
partialTextFile = _ctx.partialTextFile
csvFile = _ctx.csvFile
binaryFile = _ctx.binaryFile
tableFile = _ctx.tableFile
table = _ctx.table
beansdb = _ctx.beansdb
union = _ctx.union
zip = _ctx.zip  # shadows the builtin zip inside this module
accumulator = _ctx.accumulator
broadcast = _ctx.broadcast
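
A minimal usage sketch for these shortcuts, assuming the package is importable as dpark and that the RDDs it returns support Spark-style map/collect:

import dpark

rdd = dpark.parallelize(range(10), 2)
print rdd.map(lambda x: x * x).collect()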