def project(self, local_bounds, xfield, yfield, filters=None):
    if filters is None:
        filters = {}
    else:
        filters = filters.obj()
    mark = self.mark
    grid_shape = [self.lxres, self.lyres]
    c = client()
    for source, start, end in self.chunked().chunks:
        c.bc(render, source, start, end, filters, local_bounds,
             grid_shape, mark, xfield, yfield,
             _intermediate_results=ksdebug,
             _no_route_data=no_route_data)
    c.execute()
    results = c.br(profile='project_profile_%s' % xfield)
    return sum(results)
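# Hypothetical sketch of the contract `project` assumes of the remote
# `render` function (the real implementation lives elsewhere): each call
# bins one chunk of rows into an array of shape `grid_shape`, so the
# per-chunk grids can be combined with a plain sum().
import numpy as np

def render(source, start, end, filters, local_bounds, grid_shape, mark,
           xfield, yfield):
    grid = np.zeros(grid_shape)
    # ... load source[start:end], apply filters, and accumulate counts
    # for (xfield, yfield) points falling inside local_bounds ...
    return grid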
def test_data_routing_multiple_sources():
    # test data routing - first call
    # we set up a large data source on url1
    # and a smaller source on url2
    # execute 3 calls: url1 should have first priority, url2 second, url3 third
    setup_client(integration.url1)
    c = client()
    name1 = tempfile.NamedTemporaryFile(prefix='ks-test').name
    name2 = tempfile.NamedTemporaryFile(prefix='ks-test').name
    len1 = 1000000
    len2 = 100000
    with open(name1, 'wb+') as f:
        f.write(len1 * b'0')
    with open(name2, 'wb+') as f:
        f.write(len2 * b'0')
    remote1 = dp(name1)
    remote1.rpc_url = integration.url1
    remote2 = dp(name2)
    remote2.rpc_url = integration.url2
    remote1.save()
    remote2.save()
    c.bc(routing_func, remote1, remote2)
    c.bc(routing_func, remote1, remote2)
    c.bc(routing_func, remote1, remote2)
    c.execute()
    results = c.br()
    assert results == [integration.url1, integration.url2, integration.url3]
def test_rpc():
    # test simple execution of a dummy function
    setup_client(integration.url1)
    c = client()
    c.bc(dummy_func, 1)
    c.execute()
    result = c.br()
    assert result == [1]
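# Hypothetical sketches of the helper functions these tests assume; the
# real definitions live elsewhere in the test suite. `routing_func` is
# assumed to return the RPC URL of the worker it was routed to; the
# mechanism by which a worker learns its own URL is kitchensink-internal
# and not shown here.
import time

def dummy_func(x):
    # trivially echo the argument back to the caller
    return x

def sleep_func():
    # sleep long enough that parallel execution is observable
    # in wall-clock time
    time.sleep(1)
    return True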
def aggregate(self, results, grid_shape):
    c = client()
    data_urls = [x.data_url for x in results]
    hosts, info = c.data_info(data_urls)
    process_dict = {}
    for u in data_urls:
        hosts, meta = info[u]
        assert len(hosts) == 1
        process_dict.setdefault(list(hosts)[0], []).append(u)
    # fresh client for the second round of bulk calls
    c = client()
    for k, v in process_dict.items():
        v = [du(x) for x in v]
        queue_name = c.queue('default', host=k)
        # the bare name `aggregate` here resolves to the module-level
        # remote function, not this method
        c.bc(aggregate, v, grid_shape,
             _intermediate_results=ksdebug,
             _queue_name=queue_name,
             _no_route_data=no_route_data)
    c.execute()
    results = c.br(profile='aggregate')
    results = [x.obj() for x in results]
    results = sum(results)
    return results
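# Hypothetical sketch of the module-level `aggregate` remote function the
# method above dispatches to: on the worker that holds the data, load each
# grid stored at the given urls, sum them locally, and return the partial
# sum as a remote object (so the caller can combine partials with .obj()).
import numpy as np
from kitchensink import do

def aggregate(sources, grid_shape):
    total = np.zeros(grid_shape)
    for s in sources:
        total += s.obj()
    return do(total)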
def chunked(self):
    if self._chunked:
        return self._chunked
    c = client()
    urls = c.path_search('taxi/big.hdf5')
    urls.sort()
    objs = [du(x) for x in urls]
    chunked = Chunked(objs)
    # compute the property, for kicks
    chunked.chunks
    self._chunked = chunked
    return self._chunked
def histogram(self, field, bins, filters=None):
    if filters is None:
        filters = {}
    else:
        filters = filters.obj()
    c = client()
    for source, start, end in self.chunked().chunks:
        # the bare name `histogram` resolves to the module-level
        # remote function, not this method
        c.bc(histogram, source, start, end, filters, field, bins,
             _intermediate_results=ksdebug,
             _no_route_data=no_route_data)
    c.execute()
    # return the client so the caller can collect results with c.br()
    return c
def query(self, query_dict):
    c = client()
    chunked = self.chunked()
    for source, start, end in chunked.chunks:
        c.bc(boolfilter, source, start, end, query_dict,
             _intermediate_results=ksdebug,
             _no_route_data=no_route_data)
    c.execute()
    results = c.br(profile='profile_query')
    output = {}
    for result, (source, start, end) in zip(results, chunked.chunks):
        output[(source.data_url, start, end)] = result
    output = do(output)
    output.save(prefix='taxi/query')
    return output
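# Minimal usage sketch: `query_dict` maps field names to lists of
# predicate functions applied per chunk, matching how cleaned_data()
# calls query() below. `chunked` and the latitude bounds here are
# illustrative assumptions.
result = chunked.query({
    'pickup_latitude': [lambda v: (v > 40.5) & (v < 41.0)],
})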
def lengths(self):
    if self._lengths is not None:
        return self._lengths
    c = client()
    # `lengths` here is the module-level cache dict, not this method
    if all([source.data_url in lengths for source in self.sources]):
        self._lengths = [lengths[source.data_url] for source in self.sources]
        return self._lengths
    for source in self.sources:
        c.bc(get_length, source)
    c.execute()
    self._lengths = c.br()
    for source, length in zip(self.sources, self._lengths):
        lengths[source.data_url] = length
    return self._lengths
def cleaned_data(self):
    if self._cleaned:
        return self._cleaned
    c = client()
    if c.path_search('taxi/cleaned'):
        self._cleaned = du('taxi/cleaned').obj()
        return self._cleaned
    chunked = self.chunked()
    cleaned = chunked.query({
        'pickup_latitude': [self.clean_lat],
        'pickup_longitude': [self.clean_long],
        'dropoff_latitude': [self.clean_lat],
        'dropoff_longitude': [self.clean_long],
    })
    self._cleaned = cleaned
    do(self._cleaned).save(url='taxi/cleaned')
    return self._cleaned
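# Hypothetical sketch of the clean_lat/clean_long predicates referenced
# above: bounding-box filters that drop rows whose coordinates fall
# outside the NYC area. The exact bounds are illustrative assumptions.
@staticmethod
def clean_lat(v):
    return (v > 40.5) & (v < 41.0)

@staticmethod
def clean_long(v):
    return (v > -74.3) & (v < -73.6)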
def test_bulk_calls():
    # test sleep function which should execute in 1 second.
    # should be parallelized, and all 3 calls should execute in ~1 second
    setup_client(integration.url1)
    c = client()
    st = time.time()
    c.bc(sleep_func)
    c.bc(sleep_func)
    c.bc(sleep_func)
    c.execute()
    result = c.br()
    ed = time.time()
    print(ed - st)
    assert ed - st < 2
    assert len(result) == 3
def test_data_routing():
    # test data routing - the first call should end up on the node the
    # data is on; the other 2 calls should be parallelized on the other 2 nodes
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a': np.arange(100000)})
    remote1 = do(obj=df1)
    remote1.rpc_url = integration.url2
    remote1.save()
    c.bc(routing_func, remote1)
    c.bc(routing_func, remote1)
    c.bc(routing_func, remote1)
    c.execute()
    results = c.br()
    assert results[0] == integration.url2
    assert set(results) == set([integration.url1, integration.url2, integration.url3])
def test_remote_data_source_conversions():
    ### remote data sources can be accessed as an object, local path, or raw data
    ### test conversions of all
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a': np.arange(100)})
    shape = df1.shape
    obj = do(df1)
    obj.save()
    c.bc(remote_obj_func, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    assert result.shape == df1.shape
    obj = dp(obj.local_path())
    obj.save()
    c.bc(remote_file, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    result = pickle.loads(result)
    assert result.shape == df1.shape
def test_remote_data_sources():
    ### test grabbing obj, local_path, and raw data from a remote data source
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a': np.arange(100000)})
    shape = df1.shape
    obj = do(df1)
    obj.save()
    data_url = obj.data_url
    copy = du(data_url)
    assert copy.obj().shape == shape
    copy = du(data_url)
    df = pickle.loads(copy.raw())
    assert df.shape == shape
    copy = du(data_url)
    path = copy.local_path()
    with open(path, "rb") as f:
        df = pickle.load(f)
    assert df.shape == shape
import os
from os.path import relpath, join, basename

from kitchensink import setup_client, client, do, du, dp

setup_client('http://localhost:6323/')
c = client()
c.bc('bootstrap', 'taxi/big.hdf5', data_type='file', _rpc_name='data')
c.execute()
c.br()
import os
from os.path import relpath, join, basename
import cStringIO

import pandas as pd
import numpy as np

from kitchensink import setup_client, client, do, du, dp, Client

setup_client('http://power:6323/')
c = client(rpc_name='data', queue_name='data')
fields = [
    'posted_date', 'location_1', 'location_2', 'department', 'title',
    'salary', 'start', 'duration', 'job_type', 'applications', 'company',
    'contact', 'phone', 'fax', 'translated_location', 'latitude',
    'longitude', 'date_first_seen', 'url', 'date_last_seen'
]
tsvs = [du(x) for x in c.path_search('*employment*tsv')]

def parse(tsv):
    data = cStringIO.StringIO(tsv.raw())
    raw = pd.read_csv(
        data, sep="\t", names=fields,
        parse_dates=['posted_date', 'date_first_seen', 'date_last_seen'],
        index_col=False)
    return raw
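# Usage sketch (an assumption about intent, not part of the original
# script): parse each remote TSV locally and stack the frames.
frames = [parse(t) for t in tsvs]
employment = pd.concat(frames, ignore_index=True)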
import os
from os.path import relpath, join, basename, exists

import h5py
import pandas as pd
import cStringIO as StringIO

from kitchensink import setup_client, client, do, du, dp

setup_client('http://power:6323/')
c = client(rpc_name='data', queue_name='data')
datadir = "/data"
path = "/data/taxi"
for root, dirs, files in os.walk(path):
    for f in files:
        if not f.endswith('csv'):
            continue
        fpath = join(root, f)
        url = relpath(fpath, datadir)
        c.bc('bootstrap', url, data_type='file', _queue_name='data|power')
c.execute()
c.br()

def to_hdf(df, path):
    f = h5py.File(path, 'a')
    try:
        for x in df.columns:
            col = df[x]
            if col.dtype.kind == 'O':
                col = col.values.astype('str')
            elif col.dtype.kind == 'M':
                col = col.values.astype('int64')
            if x not in f.keys():
                # assumed completion - the original snippet is truncated
                # here; presumably the dataset is created on first write
                f.create_dataset(x, data=col, maxshape=(None,))
    finally:
        f.close()
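# Usage sketch (illustrative; the CSV filename is an assumption): convert
# one of the bootstrapped taxi CSVs into the combined HDF5 file that the
# chunked() method above searches for.
df = pd.read_csv('/data/taxi/trip_data_1.csv')
to_hdf(df, '/data/taxi/big.hdf5')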