def test_data_routing_multiple_sources():
    # test data routing with multiple sources
    # we set up a large data source on url1
    # and a smaller source on url2
    # execute 3 calls: url1 should have first priority, url2 second, url3 third

    setup_client(integration.url1)
    c = client()
    name1 = tempfile.NamedTemporaryFile(prefix='ks-test').name
    name2 = tempfile.NamedTemporaryFile(prefix='ks-test').name
    len1 = 1000000
    len2 = 100000
    with open(name1, 'wb+') as f:
        f.write(len1 * b'0')
    with open(name2, 'wb+') as f:
        f.write(len2 * b'0')
    remote1 = dp(name1)
    remote1.rpc_url = integration.url1
    remote2 = dp(name2)
    remote2.rpc_url = integration.url2
    remote1.save()
    remote2.save()
    c.bc(routing_func, remote1, remote2)
    c.bc(routing_func, remote1, remote2)
    c.bc(routing_func, remote1, remote2)
    c.execute()
    results = c.br()
    assert results == [integration.url1, integration.url2, integration.url3]


def test_rpc():
    # test simple execution of a dummy function
    setup_client(integration.url1)
    c = client()
    c.bc(dummy_func, 1)
    c.execute()
    result = c.br()
    assert result == [1]


def test_bulk_calls():
    # test sleep function which should execute in 1 second.
    # should be parallelized, and all 3 calls should execute in ~ 1 second

    setup_client(integration.url1)
    c = client()
    st = time.time()
    c.bc(sleep_func)
    c.bc(sleep_func)
    c.bc(sleep_func)
    c.execute()
    result = c.br()
    ed = time.time()
    print ed - st
    assert ed - st < 2
    assert len(result) == 3


def test_data_routing():
    # test data routing - first call
    # should end up on the node the data is on
    # other 2 calls should be parallelized on the other 2 nodes

    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a' : np.arange(100000)})
    remote1 = do(obj=df1)
    remote1.rpc_url = integration.url2
    remote1.save()

    c.bc(routing_func, remote1)
    c.bc(routing_func, remote1)
    c.bc(routing_func, remote1)
    c.execute()
    results = c.br()
    assert results[0] == integration.url2
    assert set(results) == set([integration.url1, integration.url2, integration.url3])


def test_remote_data_source_conversions():
    ### remote data sources can be accessed as an object, a local path, or raw data
    ### test conversions between all of them
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a' : np.arange(100)})
    shape = df1.shape
    obj = do(df1)
    obj.save()
    c.bc(remote_obj_func, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    assert result.shape == df1.shape

    obj = dp(obj.local_path())
    obj.save()
    c.bc(remote_file, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    result = pickle.loads(result)
    assert result.shape == df1.shape


def test_remote_data_sources():
    ### test grabbing obj, local_path, and raw data from a remote data source
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a' : np.arange(100000)})
    shape = df1.shape
    obj = do(df1)
    obj.save()
    data_url = obj.data_url
    copy = du(data_url)
    assert copy.obj().shape == shape

    copy = du(data_url)
    df = pickle.loads(copy.raw())
    assert df.shape == shape

    copy = du(data_url)
    path = copy.local_path()
    with open(path, "rb") as f:
        df = pickle.load(f)
    assert df.shape == shape
Example #8
import os
from os.path import relpath, join, basename
from kitchensink import setup_client, client, do, du, dp

setup_client('http://localhost:6323/')
c = client()
c.bc('bootstrap', 'taxi/big.hdf5', data_type='file', _rpc_name='data')
c.execute()
c.br()
Example #9
        try:
            ds1 = f[xfield]
            xdata = smartslice(ds1, start, end, bvector)
            ds2 = f[yfield]
            ydata = smartslice(ds2, start, end, bvector)
        finally:
            f.close()
    with timethis('project'):
        mark = mark.astype('float64')
        args = (xdata, ydata, grid) + grid_data_bounds + (mark,)
        fast_project(*args)
    return grid


if __name__ == "__main__":
    setup_client('http://power:6323/')
    #client().reducetree('taxi/partitioned*')
    #client().reducetree('taxi/cleaned*')
    #client().reducetree('taxi/index*')
    #client().reducetree('taxi/projections*')
    #client().reducetree('taxi/raw/projections*')
    import matplotlib.cm as cm
    st = time.time()
    ds = ARDataset()
    ds.partitions()
    #filters = ds.query({'trip_time_in_secs' : [lambda x : (x >= 1999) & (x <= 2000)]})
    filters = None
    global_bounds = ds.gbounds
    local_bounds = global_bounds
    #local_bounds = global_bounds
    local_indexes, (grid_shape, results) = ds.project(
Example #10
import logging

import pandas as pd
import numpy as np

from kitchensink import client, setup_client, do, du, dp
"""single node setup

This example illustrates basic usage of remote data sources

first example works with a remote file
second example works with a remote object (stored by pickle)
"""

setup_client("http://localhost:6323/")
c = client()
df = pd.DataFrame({'a' : np.arange(2000000)})
store = pd.HDFStore('test.hdf5')
store['df'] = df
store.close()

"""dp is a convenience function, equivalent to RemoteData(local_path=<path>)
We construct a remote data object, and save the data to the server
(which  generates a url).  Then we create a new RemoteData pointer with du
(short for data url, equivalent to RemoteData(data_url=<data_url>)
and we use that in a function call
"""

remote = dp("test.hdf5")
remote.save(prefix="testdata/test")
print remote.data_url
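
# A minimal sketch (not part of the original example) of the du step described
# in the docstring above: a hypothetical worker-side function `head_of_file`
# resolves the pointer to a local path and reads the HDF5 file with pandas.
def head_of_file(remote):
    # runs on the worker: materialize the remote file locally and read it
    store = pd.HDFStore(remote.local_path())
    result = store['df'].head()
    store.close()
    return result

new_pointer = du(remote.data_url)
c.bc(head_of_file, new_pointer)
c.execute()
print c.br()[0]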
Example #11
import os
from os.path import relpath, join, basename
from kitchensink import setup_client, client, do, du, dp, Client
import cStringIO
import pandas as pd
import numpy as np

setup_client('http://power:6323/')
c = client(rpc_name='data', queue_name='data')

fields = [
    'posted_date', 'location_1', 'location_2', 'department', 'title', 'salary',
    'start', 'duration', 'job_type', 'applications', 'company', 'contact',
    'phone', 'fax', 'translated_location', 'latitude', 'longitude',
    'date_first_seen', 'url', 'date_last_seen'
]

tsvs = [du(x) for x in c.path_search('*employment*tsv')]


def parse(tsv):
    data = cStringIO.StringIO(tsv.raw())
    raw = pd.read_csv(
        data,
        sep="\t",
        names=fields,
        parse_dates=['posted_date', 'date_first_seen', 'date_last_seen'],
        index_col=False)
    return raw
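

# A minimal usage sketch (an assumption, not taken from the original file):
# dispatch parse over every remote TSV with the same bulk-call pattern used in
# the tests above, then combine the per-file frames locally.
for tsv in tsvs:
    c.bc(parse, tsv)
c.execute()
frames = c.br()
employment = pd.concat(frames, ignore_index=True)
print employment.shape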