Example #1
File: parse.py Project: hhuuggoo/ar
def parse_and_save(tsv):
    raw = parse(tsv)
    raw = raw[[
        'latitude', 'longitude', 'posted_date', 'date_first_seen',
        'date_last_seen', 'translated_location', 'job_type', 'duration'
    ]]
    url = join('employment', 'pickled',
               basename(tsv.data_url).replace('.tsv', '.pkl'))
    do(raw).save(url=url)
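
Note: do wraps an in-memory object in a RemoteData handle, and save(url=...)
persists it on the server under an explicit URL. A minimal usage sketch,
assuming a kitchensink node is running (the server address and URL below are
illustrative):

import pandas as pd
import numpy as np
from kitchensink import setup_client, do

setup_client("http://localhost:6323/")  # illustrative server address
df = pd.DataFrame({'duration': np.arange(10)})
do(df).save(url='employment/pickled/sample.pkl')  # hypothetical URL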
Example #2
File: search.py Project: hhuuggoo/ar
def boolfilter(source, start, end, query_dict, prefilter=None):
    """
    if query_dict is present, we will load the dset into memory, and do the filtering
    if query_dict is not present, we will resort to smart slicing

    prefilter is a boolean vector
    """

    if prefilter is None:
        boolvect = np.ones(end - start, dtype=bool)  # np.bool is deprecated; use the builtin
    else:
        boolvect = prefilter.obj()
    f = h5py.File(source.local_path(), 'r')
    for field, operations in query_dict.items():
        with timethis('load_%s' % field):
            ds = f[field]
            data = ds[start:end]
        with timethis('filter_%s' % field):
            for op in operations:
                boolvect = boolvect & op(data)
    with timethis('saving'):
        obj = do(boolvect, fmt='bloscpickle')
        obj.save(prefix='index')
    return obj
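
The query_dict consumed by boolfilter maps a field name to a list of
operations, where each operation receives the numpy array loaded for that
field and returns a boolean mask. A hypothetical query (field names and
bounds are made up for illustration):

query_dict = {
    'pickup_latitude': [lambda data: (data > 40.5) & (data < 41.0)],
    'pickup_longitude': [lambda data: (data > -74.05) & (data < -73.75)],
}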
Example #3
def test_remote_data_source_conversions():
    ### remote data sources can be accessed as an object, local path, or raw data
    ### test conversions of all

    df1 = pd.DataFrame({'a' : np.arange(100000)})
    shape = df1.shape

    # start with a Python object; we should be able to convert to raw and local path
    obj = do(df1)
    path = obj.local_path()
    with open(path, "rb") as f:
        df = pickle.load(f)
    assert df.shape == shape
    df = pickle.loads(obj.raw())
    assert df.shape == shape

    # start with raw data; should be able to convert to obj and local path
    obj = dr(obj.raw())
    assert obj.obj().shape == shape
    path = obj.local_path()
    with open(path, 'rb') as f:
        df = pickle.load(f)
    assert df.shape == shape

    # start with a file; should be able to convert to obj and raw
    obj = dp(obj.local_path())
    assert obj.obj().shape == shape
    df = pickle.loads(obj.raw())
    assert df.shape == shape
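
Here do, dr, and dp construct equivalent handles from different starting
points, as the calls above suggest; each resulting handle then exposes
obj(), raw(), and local_path():

obj = do(df1)               # from an in-memory object
obj = dr(obj.raw())         # from raw (pickled) bytes
obj = dp(obj.local_path())  # from a file on disk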
Example #4
File: partition.py Project: hhuuggoo/ar
def cleaned_data(self):
    if self._cleaned:
        return self._cleaned
    c = client()
    if c.path_search('taxi/cleaned'):
        self._cleaned = du('taxi/cleaned').obj()
        return self._cleaned
    chunked = self.chunked()
    cleaned = chunked.query({
        'pickup_latitude': [self.clean_lat],
        'pickup_longitude': [self.clean_long],
        'dropoff_latitude': [self.clean_lat],
        'dropoff_longitude': [self.clean_long],
    })
    self._cleaned = cleaned
    do(self._cleaned).save(url='taxi/cleaned')
    return self._cleaned
Example #5
def test_read_only():
    c = Client(integration.url3)
    c.bc(lambda: do(None).save(url="test_read_only"), _queue_name="default|node3")
    c.execute()
    c.br()
    active_hosts, results = c.data_info(["test_read_only"])
    location_info, data_info = results["test_read_only"]
    assert "node3" not in location_info
    assert len(location_info) == 1
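
test_read_only leans on the call pattern used throughout these examples: bc
queues a remote call (optionally pinned with _queue_name), execute dispatches
everything queued, and br blocks and gathers the results. A minimal sketch,
assuming a client c is already set up:

c.bc(lambda: 1 + 1)
c.execute()
assert c.br()[0] == 2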
Example #6
File: partition.py Project: hhuuggoo/ar
def aggregate(results, grid_shape):
    with timethis('data_loading'):
        bigdata = np.zeros(grid_shape)
        for source in results:
            path = source.local_path()
            with h5py.File(path, 'r') as f:  # open read-only; close after the chunk is added
                bigdata += f['data'][:, :]
    with timethis('saving_result'):
        obj = do(bigdata)
        obj.save(prefix='taxi/aggregate')
    return obj
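
aggregate assumes each element of results behaves like a RemoteData source
whose local_path() points at an HDF5 file holding a 'data' dataset of shape
grid_shape. A hypothetical file that would satisfy it (the filename is made
up):

import h5py
import numpy as np

with h5py.File('partial.hdf5', 'w') as f:
    f.create_dataset('data', data=np.zeros((10, 10)))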
Example #7
File: partition.py Project: hhuuggoo/ar
def query(self, query_dict):
    c = client()
    chunked = self.chunked()
    for source, start, end in chunked.chunks:
        c.bc(boolfilter,
             source,
             start,
             end,
             query_dict,
             _intermediate_results=ksdebug,
             _no_route_data=no_route_data)
    c.execute()
    results = c.br(profile='profile_query')
    output = {}
    for result, (source, start, end) in zip(results, chunked.chunks):
        output[(source.data_url, start, end)] = result
    output = do(output)
    output.save(prefix='taxi/query')
    return output
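
The returned handle wraps a dict keyed by (data_url, start, end), where each
value is the handle that boolfilter saved for that chunk. A hypothetical
consumer (p stands in for an instance of this class):

results = p.query({'pickup_latitude': [p.clean_lat]}).obj()
for (url, start, end), mask_handle in results.items():
    mask = mask_handle.obj()  # boolean vector for that chunk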
Example #8
def test_data_routing():
    # test data routing - first call
    # should end up on the node the data is on
    # other 2 calls should be parallelized on the other 2 nodes

    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a' : np.arange(100000)})
    remote1 = do(obj=df1)
    remote1.rpc_url = integration.url2
    remote1.save()

    c.bc(routing_func, remote1)
    c.bc(routing_func, remote1)
    c.bc(routing_func, remote1)
    c.execute()
    results = c.br()
    assert results[0] == integration.url2
    assert set(results) == set([integration.url1, integration.url2, integration.url3])
Example #9
def test_remote_data_source_conversions():
    ### remote data sources can be accessed as an object, local path, or raw data
    ### test conversions of all
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a' : np.arange(100)})
    shape = df1.shape
    obj = do(df1)
    obj.save()
    c.bc(remote_obj_func, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    assert result.shape == df1.shape

    obj = dp(obj.local_path())
    obj.save()
    c.bc(remote_file, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    result = pickle.loads(result)
    assert result.shape == df1.shape
Example #10
def test_remote_data_sources():
    ### test grabbing obj, local_path, and raw data from a remote data source
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a' : np.arange(100000)})
    shape = df1.shape
    obj = do(df1)
    obj.save()
    data_url = obj.data_url
    copy = du(data_url)
    assert copy.obj().shape == shape

    copy = du(data_url)
    df = pickle.loads(copy.raw())
    assert df.shape == shape

    copy = du(data_url)
    path = copy.local_path()
    with open(path, "rb") as f:
        df = pickle.load(f)
    assert df.shape == shape
Example #11
"""
We construct a remote data object and save the data to the server
(which generates a url). Then we create a new RemoteData pointer with du
(short for data url, equivalent to RemoteData(data_url=<data_url>))
and use that in a function call.
"""

remote = dp("test.hdf5")
remote.save(prefix="testdata/test")
print(remote.data_url)

new_remote = du(remote.data_url)
def head(obj, name):
    store = pd.HDFStore(obj.local_path())
    return store.select(name).head(10)

c.bc(head, new_remote, 'df')
c.execute()
result = c.br()[0]
print(result)

"""do is short for dataobject, equivalent to RemoteData(obj=<obj>)
"""
remote = do(df)
remote.save()
def head(obj):
    return obj.obj().head(10)
new_remote = du(remote.data_url)
c.bc(head, new_remote)
c.execute()
print(c.br()[0])
Example #12
import time

import pandas as pd
import numpy as np

from kitchensink import client, setup_client, do

setup_client("http://localhost:6323/")
c = client()
df = pd.DataFrame({'a' : np.arange(2000000)})
obj = do(df)
obj.save()
print(obj[100:110].obj())
print(obj[100:110]['a'].obj())
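
Slicing appears to compose before any data moves: obj[100:110] yields another
remote handle, and only the trailing .obj() materializes the result locally.
Reading the example above, a column selection can be chained the same way:

subset = obj[100:110]['a']  # still a remote handle
local = subset.obj()        # pulls ten rows of column 'a' to the client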