Example #1
    def project(self, local_bounds, xfield, yfield, filters=None):
        if filters is None:
            filters = {}
        else:
            filters = filters.obj()
        mark = self.mark
        grid_shape = [self.lxres, self.lyres]
        c = client()
        for source, start, end in self.chunked().chunks:
            c.bc(render,
                 source,
                 start,
                 end,
                 filters,
                 local_bounds,
                 grid_shape,
                 mark,
                 xfield,
                 yfield,
                 _intermediate_results=ksdebug,
                 _no_route_data=no_route_data)

        c.execute()
        results = c.br(profile='project_profile_%s' % xfield)
        return sum(results)
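
The worker function render (along with ksdebug and no_route_data) is defined elsewhere in the module and not shown on this page. A hedged sketch of what a render-style worker could look like, assuming source.obj() yields a DataFrame and local_bounds is ((xmin, xmax), (ymin, ymax)); this is an illustration, not the project's actual implementation:

import numpy as np

def render_sketch(source, start, end, filters, local_bounds, grid_shape, mark,
                  xfield, yfield):
    # assumption: the chunk is the [start:end] row slice of the source object;
    # filters and mark are ignored in this sketch
    df = source.obj()[start:end]
    (xmin, xmax), (ymin, ymax) = local_bounds
    # bin the two fields into a grid; the driver above sums these grids
    grid, _, _ = np.histogram2d(df[xfield], df[yfield],
                                bins=grid_shape,
                                range=[[xmin, xmax], [ymin, ymax]])
    return grid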
Example #2
def test_data_routing_multiple_sources():
    # test data routing - first call
    # we setup a large data source on url1
    # and a smaller source on url2
    # execute 3 calls; url1 should have first priority, url2 second, url3 third

    setup_client(integration.url1)
    c = client()
    name1 = tempfile.NamedTemporaryFile(prefix='ks-test').name
    name2 = tempfile.NamedTemporaryFile(prefix='ks-test').name
    len1 = 1000000
    len2 = 100000
    with open(name1, 'wb+') as f:
        f.write(len1 * b'0')
    with open(name2, 'wb+') as f:
        f.write(len2 * b'0')
    remote1 = dp(name1)
    remote1.rpc_url = integration.url1
    remote2 = dp(name2)
    remote2.rpc_url = integration.url2
    remote1.save()
    remote2.save()
    c.bc(routing_func, remote1, remote2)
    c.bc(routing_func, remote1, remote2)
    c.bc(routing_func, remote1, remote2)
    c.execute()
    results = c.br()
    assert results == [integration.url1, integration.url2, integration.url3]
Example #3
def test_rpc():
    # test simple execution of a dummy function
    setup_client(integration.url1)
    c = client()
    c.bc(dummy_func, 1)
    c.execute()
    result = c.br()
    assert result == [1]
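
The helpers dummy_func, sleep_func, and routing_func are not shown on this page. Minimal sketches consistent with the assertions above (dummy_func echoes its argument, sleep_func sleeps about a second); routing_func presumably returns the RPC url of the node it executed on, but how it discovers that url is not visible here:

import time

def dummy_func(x):
    # echoing the argument makes c.br() yield [1] for c.bc(dummy_func, 1)
    return x

def sleep_func():
    # sleeping ~1 second lets three parallel calls finish in roughly a second
    time.sleep(1)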
Example #4
 def aggregate(self, results, grid_shape):
     c = client()
     data_urls = [x.data_url for x in results]
     hosts, info = c.data_info(data_urls)
     process_dict = {}
     for u in data_urls:
         hosts, meta = info[u]
         assert len(hosts) == 1
         process_dict.setdefault(list(hosts)[0], []).append(u)
     c = client()
     for k, v in process_dict.items():
         v = [du(x) for x in v]
         queue_name = c.queue('default', host=k)
         c.bc(aggregate, v, grid_shape, _intermediate_results=ksdebug, _queue_name=queue_name, _no_route_data=no_route_data)
     c.execute()
     results = c.br(profile='aggregate')
     results = [x.obj() for x in results]
     results = sum(results)
     return results
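
The remote aggregate function dispatched by c.bc above is not shown. A hedged sketch of what it might do, assuming the du() handles passed in yield numpy grids via .obj() and that the worker hands back a data object the driver can call .obj() on again:

import numpy as np
from kitchensink import do

def aggregate_sketch(partials, grid_shape):
    # sum the partial grids that live on this host into one grid
    total = np.zeros(grid_shape)
    for p in partials:
        total += p.obj()
    return do(total)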
Example #5
 def chunked(self):
     if self._chunked:
         return self._chunked
     c = client()
     urls = c.path_search('taxi/big.hdf5')
     urls.sort()
     objs = [du(x) for x in urls]
     chunked = Chunked(objs)
     #compute the property, for kicks
     chunked.chunks
     self._chunked = chunked
     return self._chunked
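
Chunked itself is not shown on this page. Judging from the lengths() property (Example #9) and the (source, start, end) loops above, its chunks property presumably splits each source into fixed-size row ranges; a speculative sketch, with the chunk size made up:

class ChunkedSketch(object):
    def __init__(self, sources, chunksize=1000000):
        self.sources = sources
        self.chunksize = chunksize

    @property
    def chunks(self):
        # yield (source, start, end) row ranges covering every source
        out = []
        for source, length in zip(self.sources, self.lengths()):
            for start in range(0, length, self.chunksize):
                out.append((source, start, min(start + self.chunksize, length)))
        return out

    def lengths(self):
        # the real class computes these remotely with get_length (Example #9)
        raise NotImplementedError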
Example #7
 def histogram(self, field, bins, filters=None):
     st = time.time()
     c = client()
     if filters is None:
         filters = {}
     else:
         filters = filters.obj()
     for source, start, end in self.chunked().chunks:
         c.bc(histogram, source, start, end, filters, field, bins, _intermediate_results=ksdebug, _no_route_data=no_route_data)
     ed = time.time()
     c.execute()
     return c
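
Unlike project(), histogram() returns the client before br() is called, so the caller presumably collects and combines the per-chunk results itself. A hedged usage sketch; the object name, field, bin count, and profile label are all made up, and it assumes each remote call returns just an array of counts:

c = ds.histogram('trip_distance', 100)
counts = sum(c.br(profile='histogram'))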
Example #8
 def query(self, query_dict):
     c = client()
     chunked = self.chunked()
     for source, start, end in chunked.chunks:
         c.bc(boolfilter, source, start, end, query_dict, _intermediate_results=ksdebug, _no_route_data=no_route_data)
     c.execute()
     results = c.br(profile='profile_query')
     output = {}
     for result, (source, start, end) in zip(results, chunked.chunks):
         output[(source.data_url, start, end)] = result
     output = do(output)
     output.save(prefix='taxi/query')
     return output
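
Because query() saves its output under the 'taxi/query' prefix, a later run could reload it instead of recomputing, mirroring the caching pattern in cleaned_data (Example #12). A hedged sketch, assuming the prefix-saved url matches a 'taxi/query*' glob:

c = client()
urls = c.path_search('taxi/query*')
if urls:
    cached = du(urls[0]).obj()   # dict keyed by (data_url, start, end)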
Example #9
 def lengths(self):
     if self._lengths is not None:
         return self._lengths
     c = client()
     if all([source.data_url in lengths for source in self.sources]):
         return [lengths[source.data_url] for source in self.sources]
     for source in self.sources:
         c.bc(get_length, source)
     c.execute()
     self._lengths = c.br()
     for source, length in zip(self.sources, self._lengths):
         lengths[source.data_url] = length
     return self._lengths
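
The get_length worker is not shown. A minimal sketch, assuming the whole source can simply be loaded and measured on the worker (a real implementation would likely read only metadata):

def get_length_sketch(source):
    # load the chunk source on the worker and report its row count
    return len(source.obj())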
Example #11
 def aggregate(self, results, grid_shape):
     c = client()
     data_urls = [x.data_url for x in results]
     hosts, info = c.data_info(data_urls)
     process_dict = {}
     for u in data_urls:
         hosts, meta = info[u]
         assert len(hosts) == 1
         process_dict.setdefault(list(hosts)[0], []).append(u)
     c = client()
     for k, v in process_dict.items():
         v = [du(x) for x in v]
         queue_name = c.queue('default', host=k)
         c.bc(aggregate,
              v,
              grid_shape,
              _intermediate_results=ksdebug,
              _queue_name=queue_name,
              _no_route_data=no_route_data)
     c.execute()
     results = c.br(profile='aggregate')
     results = [x.obj() for x in results]
     results = sum(results)
     return results
Example #12
 def cleaned_data(self):
     if self._cleaned:
         return self._cleaned
     c = client()
     if c.path_search('taxi/cleaned'):
         self._cleaned = du('taxi/cleaned').obj()
         return self._cleaned
     chunked = self.chunked()
     cleaned = chunked.query({'pickup_latitude' : [self.clean_lat],
                              'pickup_longitude' : [self.clean_long],
                              'dropoff_latitude' : [self.clean_lat],
                              'dropoff_longitude' : [self.clean_long],
                          })
     self._cleaned = cleaned
     do(self._cleaned).save(url='taxi/cleaned')
     return self._cleaned
Example #13
def test_bulk_calls():
    # test sleep function which should execute in 1 second.
    # should be parallelized, and all 3 calls should execute in ~ 1 second

    setup_client(integration.url1)
    c = client()
    st = time.time()
    c.bc(sleep_func)
    c.bc(sleep_func)
    c.bc(sleep_func)
    c.execute()
    result = c.br()
    ed = time.time()
    print(ed - st)
    assert ed - st < 2
    assert len(result) == 3
Example #14
    def project(self, local_bounds, xfield, yfield, filters=None):
        if filters is None:
            filters = {}
        else:
            filters = filters.obj()
        mark = self.mark
        grid_shape = [self.lxres, self.lyres]
        c = client()
        for source, start, end in self.chunked().chunks:
            c.bc(render, source, start, end, filters,
                 local_bounds, grid_shape, mark,
                 xfield, yfield, _intermediate_results=ksdebug,
                 _no_route_data=no_route_data)

        c.execute()
        results = c.br(profile='project_profile_%s' % xfield)
        return sum(results)
Example #15
 def cleaned_data(self):
     if self._cleaned:
         return self._cleaned
     c = client()
     if c.path_search('taxi/cleaned'):
         self._cleaned = du('taxi/cleaned').obj()
         return self._cleaned
     chunked = self.chunked()
     cleaned = chunked.query({
         'pickup_latitude': [self.clean_lat],
         'pickup_longitude': [self.clean_long],
         'dropoff_latitude': [self.clean_lat],
         'dropoff_longitude': [self.clean_long],
     })
     self._cleaned = cleaned
     do(self._cleaned).save(url='taxi/cleaned')
     return self._cleaned
Example #16
def test_data_routing():
    # test data routing - first call
    # should end up on the node the data is on
    # other 2 calls should be parallelized on the other 2 nodes

    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a' : np.arange(100000)})
    remote1 = do(obj=df1)
    remote1.rpc_url = integration.url2
    remote1.save()

    c.bc(routing_func, remote1)
    c.bc(routing_func, remote1)
    c.bc(routing_func, remote1)
    c.execute()
    results = c.br()
    assert results[0] == integration.url2
    assert set(results) == set([integration.url1, integration.url2, integration.url3])
Example #17
 def query(self, query_dict):
     c = client()
     chunked = self.chunked()
     for source, start, end in chunked.chunks:
         c.bc(boolfilter,
              source,
              start,
              end,
              query_dict,
              _intermediate_results=ksdebug,
              _no_route_data=no_route_data)
     c.execute()
     results = c.br(profile='profile_query')
     output = {}
     for result, (source, start, end) in zip(results, chunked.chunks):
         output[(source.data_url, start, end)] = result
     output = do(output)
     output.save(prefix='taxi/query')
     return output
Example #18
 def histogram(self, field, bins, filters=None):
     st = time.time()
     c = client()
     if filters is None:
         filters = {}
     else:
         filters = filters.obj()
     for source, start, end in self.chunked().chunks:
         c.bc(histogram,
              source,
              start,
              end,
              filters,
              field,
              bins,
              _intermediate_results=ksdebug,
              _no_route_data=no_route_data)
     ed = time.time()
     c.execute()
     return c
Example #19
def test_remote_data_source_conversions():
    ### remote data sources can be accessed as an object, local path, or raw data
    ### test conversions of all
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a' : np.arange(100)})
    shape = df1.shape
    obj = do(df1)
    obj.save()
    c.bc(remote_obj_func, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    assert result.shape == df1.shape

    obj = dp(obj.local_path())
    obj.save()
    c.bc(remote_file, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    result = pickle.loads(result)
    assert result.shape == df1.shape
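
remote_obj_func and remote_file are not shown on this page. Sketches consistent with the assertions above, assuming they simply hand back the deserialized object and the raw file bytes respectively:

def remote_obj_func(source):
    # deserialize the remote data source into the original DataFrame
    return source.obj()

def remote_file(source):
    # return the raw bytes of the file; the test unpickles them client-side
    return source.raw()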
Example #20
def test_remote_data_sources():
    ### test grabbing obj, local_path, and raw data from a remote data source
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a' : np.arange(100000)})
    shape = df1.shape
    obj = do(df1)
    obj.save()
    data_url = obj.data_url
    copy = du(data_url)
    assert copy.obj().shape == shape

    copy = du(data_url)
    df = pickle.loads(copy.raw())
    assert df.shape == shape

    copy = du(data_url)
    path = copy.local_path()
    with open(path, "rb") as f:
        df = pickle.load(f)
    assert df.shape == shape
Example #22
import os
from os.path import relpath, join, basename
from kitchensink import setup_client, client, do, du, dp

setup_client('http://localhost:6323/')
c = client()
c.bc('bootstrap', 'taxi/big.hdf5', data_type='file', _rpc_name='data')
c.execute()
c.br()
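
A hedged follow-up: once the bootstrap has run, the file's presence on the cluster can be confirmed with path_search, just as the chunked() examples above do:

print(c.path_search('taxi/big.hdf5'))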
Example #23
import os
from os.path import relpath, join, basename
from kitchensink import setup_client, client, do, du, dp, Client
import cStringIO
import pandas as pd
import numpy as np

setup_client('http://power:6323/')
c = client(rpc_name='data', queue_name='data')

fields = [
    'posted_date', 'location_1', 'location_2', 'department', 'title', 'salary',
    'start', 'duration', 'job_type', 'applications', 'company', 'contact',
    'phone', 'fax', 'translated_location', 'latitude', 'longitude',
    'date_first_seen', 'url', 'date_last_seen'
]

tsvs = [du(x) for x in c.path_search('*employment*tsv')]


def parse(tsv):
    data = cStringIO.StringIO(tsv.raw())
    raw = pd.read_csv(
        data,
        sep="\t",
        names=fields,
        parse_dates=['posted_date', 'date_first_seen', 'date_last_seen'],
        index_col=False)
    return raw
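
With parse defined, the natural next step, following the bc / execute / br pattern used throughout these examples, would be to fan it out over the discovered TSVs and combine the results locally; the pd.concat at the end is an assumption about what the caller wants:

for tsv in tsvs:
    c.bc(parse, tsv)
c.execute()
frames = c.br()
df = pd.concat(frames)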

Example #24
import os
from os.path import relpath, join, basename, exists
import h5py
import pandas as pd
import cStringIO as StringIO
from kitchensink import setup_client, client, do, du, dp
setup_client('http://power:6323/')
c = client(rpc_name='data', queue_name='data')

datadir = "/data"
path = "/data/taxi"
for root, dirs, files in os.walk(path):
    for f in files:
        if not f.endswith('csv'):
            continue
        path = join(root, f)
        url = relpath(path, datadir)
        c.bc('bootstrap', url, data_type='file', _queue_name='data|power')
c.execute()
c.br()

def to_hdf(df, path):
    f = h5py.File(path, 'a')
    try:
        for x in df.columns:
            col = df[x]
            if col.dtype.kind == 'O':
                col = col.values.astype('str')
            elif col.dtype.kind == 'M':
                col = col.values.astype('int64')
            if x not in f.keys():