示例#1
0
文件: viz.py 项目: kcompher/topik
    def plot(self, output_file="termite.html"):
        """ Render the termite plot for ``self.input_file`` and show it.

        The input is loaded through blaze, every row gets a ``size`` column
        derived from its ``weight``, and one circle per row is drawn at
        (``topic``, ``word``) with bokeh.

        Parameters
        ----------
        output_file : str
            Path passed to ``bokeh.plotting.output_file``.
        """
        t = blz.Data(self.input_file)

        MAX = blz.compute(t.weight.max())
        MIN = blz.compute(t.weight.min())

        # Create a size variable to define the size of the circle for the plot.
        # NOTE(review): MAX == MIN makes the denominator zero -- confirm the
        # input always has more than one distinct weight.
        t = blz.transform(t, size=blz.sqrt((t.weight - MIN)/(MAX - MIN))*50)

        WORDS = into(list, t['word'].distinct())
        topics = into(list, t['topic'].distinct())
        # Convert topics to strings
        TOPICS = [str(i) for i in topics]

        source = into(pd.DataFrame, t)

        plt.output_file(output_file)

        data_source = ColumnDataSource(source)

        p = plt.figure(x_range=TOPICS, y_range=WORDS,
                       plot_width=1000, plot_height=1700,
                       title=self.title)

        p.circle(x="topic", y="word", size="size", fill_alpha=0.6, source=data_source)
        # Let logging do the interpolation so it only happens when emitted.
        logging.info("generating termite plot for file %s", self.input_file)
        plt.show(p)
示例#2
0
def concrete_head(expr, n=10):
    """ Return head of computed expression

    Parameters
    ----------
    expr : blaze expression
        Must contain at least one data resource.
    n : int
        ``expr.head(n + 1)`` is materialised.
        NOTE(review): the extra row presumably lets callers detect that more
        data exists -- confirm against the caller.

    Returns
    -------
    ``compute(expr)`` for non-collection expressions, otherwise a DataFrame.
    For non-record measures the single column is named ``expr._name``.

    Raises
    ------
    ValueError
        If the expression has no data resources.
    """
    if not expr._resources():
        raise ValueError("Expression does not contain data resources")
    if not iscollection(expr.dshape):
        return compute(expr)

    head = expr.head(n + 1)

    if isrecord(expr.dshape.measure):
        return into(DataFrame, head)

    # Non-record measure: label the lone column with the expression's name.
    df = into(DataFrame, head)
    df.columns = [expr._name]
    return df
def test_simple_into(tbl):
    """Load the CSV file into the SQL table and compare the rows read back."""
    source = CSV(file_name)
    target = resource(url, tbl, dshape=ds)
    into(target, source, dshape=ds)

    rows = into(list, target)
    assert rows == data
def test_no_header_no_columns(tbl):
    """Append the CSV file to the SQL table using the explicit datashape."""
    csv_source = CSV(file_name)
    table = resource(url, tbl, dshape=ds)
    into(table, csv_source, dshape=ds)

    loaded = into(list, table)
    assert loaded == data
示例#5
0
def test_complex_into(complex_csv, complex_sql):
    """Copy the complex CSV into SQL and compare both sides with a tolerance."""
    table, bind = complex_sql
    # data from: http://dummydata.me/generate
    into(table, complex_csv, dshape=discover(table), bind=bind)

    from_sql = into(list, table, bind=bind)
    from_csv = into(list, complex_csv)
    assert_allclose(from_sql, from_csv)
示例#6
0
文件: viz.py 项目: kwinkunks/topik
    def plot(self, output_file="termite.html"):
        """Draw the termite plot for ``self.input_file`` and open it with bokeh.

        ``output_file`` is the path handed to ``bokeh.plotting.output_file``.
        """
        import blaze as blz
        from odo import into
        import pandas as pd
        import bokeh.plotting as plt
        from bokeh.models.sources import ColumnDataSource

        table = blz.Data(self.input_file)

        weight_max = blz.compute(table.weight.max())
        weight_min = blz.compute(table.weight.min())

        # Circle size: square root of the min-max normalised weight, times 50.
        normalised = (table.weight - weight_min) / (weight_max - weight_min)
        table = blz.transform(table, size=blz.sqrt(normalised) * 50)

        word_labels = into(list, table['word'].distinct())
        # Topics are converted to strings before being used as the x range.
        topic_labels = [str(topic)
                        for topic in into(list, table['topic'].distinct())]

        frame = into(pd.DataFrame, table)

        plt.output_file(output_file)

        columns = ColumnDataSource(frame)

        figure = plt.figure(
            x_range=topic_labels,
            y_range=word_labels,
            plot_width=1000,
            plot_height=1700,
            title=self.title,
        )
        figure.circle(
            x="topic", y="word", size="size", fill_alpha=0.6, source=columns
        )
        plt.show(figure)
示例#7
0
def test_csv_with_header():
    """Load a headered CSV into SQLite and check the schema and the rows."""
    csv_text = 'a,b\n1,2\n3,4'
    with tmpfile('db') as dbfilename:
        with filetext(csv_text, extension='csv') as csvfilename:
            target_uri = 'sqlite:///%s::mytable' % dbfilename
            table = into(target_uri, csvfilename, has_header=True)

            expected_shape = dshape('var * {a: int64, b: int64}')
            assert discover(table) == expected_shape
            assert into(set, table) == {(1, 2), (3, 4)}
示例#8
0
def sql():
    """Return an in-memory SQLite table ``foo`` loaded with three (x, y) rows."""
    rows = [(1, 2), (10, 20), (100, 200)]
    table = resource(
        'sqlite:///:memory:',
        'foo',
        dshape='var * {x: int, y: int}',
    )
    into(table, rows)
    return table
示例#9
0
文件: test_ssh.py 项目: Curezhang/odo
def test_s3_to_ssh():
    """Copy an S3 CSV to a temporary SSH CSV and compare rows and datashape."""
    pytest.importorskip('boto')

    tips_uri = 's3://nyqpug/tips.csv'
    with tmpfile('.csv') as fn:
        remote = into(Temp(SSH(CSV))(fn, hostname='localhost'), tips_uri)

        remote_rows = into(list, remote)
        assert remote_rows == into(list, tips_uri)

        remote_shape = discover(remote)
        assert remote_shape == discover(resource(tips_uri))
示例#10
0
def test_copy_local_files_to_hdfs():
    """Push a local CSV to HDFS and check both sides share a datashape."""
    payload = 'name,amount\nAlice,100\nBob,200'
    with tmpfile_hdfs() as target:
        with filetext(payload) as source:
            local_csv = CSV(source)
            remote_csv = HDFS(CSV)(target, hdfs=hdfs)
            # 10 bytes per message
            into(remote_csv, local_csv, blocksize=10)

            assert discover(remote_csv) == discover(local_csv)
示例#11
0
文件: test_sql.py 项目: blaze/blaze
def sql():
    """Return blaze data over an in-memory SQLite table ``foo`` with three rows."""
    rows = [(1, 2), (10, 20), (100, 200)]
    table = bz_data(
        'sqlite:///:memory:::foo',
        dshape='var * {x: int, y: int}',
    )
    into(table, rows)
    return table
示例#12
0
def test_csv_to_s3_into():
    """Upload a CSV to S3 via ``into`` and read it back as a DataFrame."""
    expected = tm.makeMixedDataFrame()
    with tmpfile('.csv') as fn:
        with s3_bucket('.csv') as b:
            expected.to_csv(fn, index=False)
            uploaded = into(b, CSV(fn))
            roundtrip = into(pd.DataFrame, uploaded)
    tm.assert_frame_equal(expected, roundtrip)
示例#13
0
def test_csv_to_s3__using_multipart_upload():
    """Upload one large-cell CSV to S3 with ``multipart=True`` and read it back."""
    big_cell = "*" * 5 * 1024 ** 2
    expected = pd.DataFrame({'a': [big_cell]})
    with tmpfile('.csv') as fn:
        with s3_bucket('.csv') as b:
            expected.to_csv(fn, index=False)
            uploaded = into(b, CSV(fn), multipart=True)
            roundtrip = into(pd.DataFrame, uploaded)
    tm.assert_frame_equal(expected, roundtrip)
示例#14
0
def test_append_other():
    """Write the ndarray form of ``df`` to an HDFStore and compare datashapes."""
    with tmpfile('.hdf5') as fn:
        records = into(np.ndarray, df)
        target_uri = 'hdfstore://' + fn + '::/data'
        stored = into(target_uri, records)
        try:
            stored_shape = discover(stored)
            assert stored_shape == discover(df)
        finally:
            # Close the parent store whether or not the assertion passed.
            stored.parent.close()
示例#15
0
def test_month():
    """Truncating datetimes to one month gives the first day of each month."""
    stamps = into(np.ndarray, [
        datetime(2000, 7, 1),
        datetime(2000, 6, 30),
        datetime(2000, 6, 1),
        datetime(2000, 5, 31),
    ])

    truncated = compute(s.truncate(1, "month"), stamps)
    expected = into(np.ndarray, [
        date(2000, 7, 1),
        date(2000, 6, 1),
        date(2000, 6, 1),
        date(2000, 5, 1),
    ])
    assert eq(truncated, expected)
示例#16
0
def test_into_resource():
    """Store ``df`` under an hdfstore URI and read it back unchanged."""
    with tmpfile('.hdf5') as fn:
        stored = into('hdfstore://' + fn + '::/x', df)
        try:
            assert discover(stored) == discover(df)
            roundtrip = into(pd.DataFrame, stored)
            assert eq(roundtrip, df)
        finally:
            # Close the parent store whether or not the assertions passed.
            stored.parent.close()
示例#17
0
文件: test_h5py.py 项目: MoherX/odo
def test_varlen_dtypes():
    """Write a record array with an object-dtype field to an HDF5 dataset."""
    field_types = [('name', 'O'), ('amount', 'i4')]
    y = np.array([('Alice', 100), ('Bob', 200)], dtype=field_types)
    with tmpfile('.hdf5') as fn:
        dset = into(fn + '::/data', y)
        try:
            # NOTE(review): both sides read from ``dset``, so this only shows
            # that the dataset can be read twice with equal results; it does
            # not compare against ``y``. Confirm whether that is intended.
            first_read = into(list, dset)
            second_read = into(list, dset)
            assert first_read == second_read
        finally:
            dset.file.close()
示例#18
0
def test_table_resource():
    """``Data`` on a CSV filename wraps a ``CSV`` and computes to the same rows."""
    with tmpfile('csv') as filename:
        schema = dshape('var * {a: int, b: int}')
        backing = CSV(filename)
        append(backing, [[1, 2], [10, 20]], dshape=schema)

        wrapped = Data(filename)
        assert isinstance(wrapped.data, CSV)

        computed_rows = into(list, compute(wrapped))
        assert computed_rows == into(list, backing)
示例#19
0
def eq(a, b):
    """Compare ``a`` and ``b``, collapsing an array result to one truth value.

    DataFrame arguments are converted to ndarrays with ``into`` before the
    comparison. If ``a == b`` yields an ndarray, its ``.all()`` is returned;
    otherwise the comparison result is returned unchanged.
    """
    lhs = into(np.ndarray, a) if isinstance(a, pd.DataFrame) else a
    rhs = into(np.ndarray, b) if isinstance(b, pd.DataFrame) else b
    outcome = lhs == rhs
    return outcome.all() if isinstance(outcome, np.ndarray) else outcome
示例#20
0
文件: test_json.py 项目: MoherX/odo
def test_datetimes():
    """Datetime records survive a JSONLines round trip (compared via ``str``)."""
    from odo import into
    import numpy as np

    records = [
        {'a': 1, 'dt': datetime.datetime(2001, 1, 1)},
        {'a': 2, 'dt': datetime.datetime(2002, 2, 2)},
    ]
    with tmpfile('json') as fn:
        lines = JSONLines(fn)
        append(lines, records)

        from_file = str(into(np.ndarray, lines))
        from_memory = str(into(np.ndarray, records))
        assert from_file == from_memory
示例#21
0
文件: test_csv.py 项目: shoyer/odo
def test_pandas_csv_naive_behavior_results_in_columns():
    """Writing a DataFrame to a CSV path puts the column names on line one."""
    rows = [
        [1, "Alice", 100],
        [2, "Bob", -200],
        [3, "Charlie", 300],
        [4, "Denis", 400],
        [5, "Edith", -500],
    ]
    frame = pd.DataFrame(rows, columns=["id", "name", "amount"])
    with tmpfile(".csv") as fn:
        into(fn, frame)

        with open(fn) as handle:
            header = next(handle).strip()
            assert header == "id,name,amount"
示例#22
0
文件: test_csv.py 项目: ahasha/odo
def test_temp_csv():
    """A ``Temp(CSV)`` holds the data and its file is gone after collection."""
    import gc

    path = '_test_temp_csv.csv'
    temp_csv = into(Temp(CSV)(path), df)
    assert isinstance(temp_csv, CSV)

    assert into(list, temp_csv) == into(list, df)

    # Drop the only reference and force a collection; the test expects the
    # backing file to have been removed by then.
    del temp_csv
    gc.collect()
    assert not os.path.exists(path)
示例#23
0
文件: test_ssh.py 项目: Curezhang/odo
def test_ssh_csv_to_s3_csv():
    """Copy a temporary SSH CSV into an S3 bucket and compare datashapes."""
    # for some reason this can only be run in the same file as other ssh tests
    # and must be a Temp(SSH(CSV)) otherwise tests above this one fail
    aws_tests = pytest.importorskip('odo.backends.tests.test_aws')
    s3_bucket = aws_tests.s3_bucket

    csv_text = 'name,balance\nAlice,100\nBob,200'
    with filetext(csv_text, extension='csv') as fn:
        remote = into(Temp(SSH(CSV)), CSV(fn), hostname='localhost')
        with s3_bucket('.csv') as b:
            uploaded = into(b, remote)
            assert discover(uploaded) == discover(resource(b))
示例#24
0
def test_data_on_iterator_refies_data():
    """``Data`` built from an iterator can be converted to a list repeatedly."""
    values = [1, 2, 3]
    wrapped = Data(iter(values))

    # Two conversions in a row must both yield the full data.
    for _ in range(2):
        assert into(list, wrapped) == values

    # in context
    with Data(iter(values)) as managed:
        assert managed is not None
示例#25
0
def test_into_sqlite():
    """Append a headerless CSV to a SQLite table via the CSV-to-SQL path."""
    rows = [('Alice', 100), ('Bob', 200)]
    schema = datashape.dshape('var * {name: string, amount: int}')

    with tmpfile('.db') as dbpath:
        with tmpfile('.csv') as csvpath:
            csv_file = into(csvpath, rows, dshape=schema, has_header=False)
            table = resource('sqlite:///%s::mytable' % dbpath, dshape=schema)
            # NotImplementedError from inside this block is ignored, which
            # also skips the assertion when the append raises it.
            with ignoring(NotImplementedError):
                append_csv_to_sql_table(table, csv_file)
                assert into(list, table) == rows
示例#26
0
def test_into_sqlite_with_header_and_different_sep():
    """A pipe-delimited CSV with a header loads into SQLite with equal rows."""
    frame = pd.DataFrame(
        [('Alice', 100), ('Bob', 200)],
        columns=['name', 'amount'],
    )
    with tmpfile('.csv') as fn:
        piped_csv = into(fn, frame, delimiter='|')

        with tmpfile('.db') as sql:
            table_uri = 'sqlite:///%s::df' % sql
            db = resource(table_uri, dshape=discover(piped_csv))
            loaded = into(db, piped_csv)

            assert into(list, loaded) == into(list, frame)
示例#27
0
def test_hdfs_directory_hive_creation():
    """Create a Hive table from an HDFS directory, then append one more file.

    Checks that the result is a SQLAlchemy table with at least one row and
    datashape ``ds``, and that appending file ``c`` leaves more rows than
    there were distinct rows before.
    """
    with accounts_data() as (hdfs_directory, (a, b, c)):
        with hive_table(host) as uri:
            t = into(uri, hdfs_directory)
            assert isinstance(t, sa.Table)
            result = into(set, t)
            # Compare the size: ordering a set against an int raises
            # TypeError on Python 3.
            assert len(result) > 0
            assert discover(t) == ds

            t2 = into(uri, c)  # append new singleton file
            assert len(into(list, t2)) > len(result)
示例#28
0
文件: test_csv.py 项目: ahasha/odo
def test_pandas_csv_naive_behavior_results_in_columns():
    """The CSV written from a DataFrame starts with the column-name header."""
    column_names = ['id', 'name', 'amount']
    records = [
        [1, 'Alice', 100],
        [2, 'Bob', -200],
        [3, 'Charlie', 300],
        [4, 'Denis', 400],
        [5, 'Edith', -500],
    ]
    df = pd.DataFrame(records, columns=column_names)
    with tmpfile('.csv') as fn:
        into(fn, df)

        with open(fn) as f:
            first_line = next(f)
            assert first_line.strip() == 'id,name,amount'
示例#29
0
def test_hour():
    """Truncating datetimes to one hour drops the minutes and seconds."""
    stamps = into(np.ndarray, [
        datetime(2000, 6, 20, 1, 0, 0),
        datetime(2000, 6, 20, 12, 59, 59),
        datetime(2000, 6, 20, 12, 0, 0),
        datetime(2000, 6, 20, 11, 59, 59),
    ])

    truncated = compute(s.truncate(1, 'hour'), stamps)
    expected = into(np.ndarray, [
        datetime(2000, 6, 20, 1, 0),
        datetime(2000, 6, 20, 12, 0),
        datetime(2000, 6, 20, 12, 0),
        datetime(2000, 6, 20, 11, 0),
    ])
    assert eq(truncated, expected)
示例#30
0
def test_append_to_array():
    """Append a blocked ``Array`` to a bcolz array and to an HDF5 dataset.

    Both targets must end up holding the same values as the source ndarray.
    """
    x = np.arange(600).reshape((20, 30))
    a = into(Array, x, blockshape=(4, 5))
    b = bcolz.zeros(shape=(0, 30), dtype=x.dtype)

    append(b, a)
    assert eq(b[:], x)

    with tmpfile('hdf5') as fn:
        h = into(fn+'::/data', a)
        try:
            assert eq(h[:], x)
        finally:
            # Close the file even when the assertion fails so the handle is
            # not left open while the temporary file is cleaned up.
            h.file.close()
示例#31
0
def test_scalar_sql_compute():
    """The repr of a scalar sum over a SQL-backed column is the computed value."""
    schema = dshape('var * {name: string, amount: int}')
    table = into('sqlite:///:memory:::t', tdata, dshape=schema)
    wrapped = data(table)

    total_repr = expr_repr(wrapped.amount.sum())
    assert total_repr == '300'
示例#32
0
文件: hdfstore.py 项目: bopopescu/QC
def pre_compute(expr, data, **kwargs):
    """Convert ``data`` to ``chunks(pd.DataFrame)``; ``expr`` is not used."""
    target_type = chunks(pd.DataFrame)
    return into(target_type, data, **kwargs)
示例#33
0
def test_copy_hdfs_files_locally():
    """Copy the first HDFS accounts file to a local CSV and compare its text."""
    with tmpfile('csv') as target:
        with accounts_data() as (d, (a, b, c)):
            local_csv = into(target, a)
            with open(local_csv.path) as handle:
                contents = handle.read()
            assert contents.strip() == accounts_1_csv
示例#34
0
def test_into_resource():
    """Store ``df`` under an hdfstore URI and check it round-trips.

    The stored object must have the same datashape as ``df`` and convert
    back to an equal DataFrame.
    """
    with tmpfile('.hdf5') as fn:
        d = into('hdfstore://' + fn + '::/x', df)
        try:
            assert discover(d) == discover(df)
            assert eq(into(pd.DataFrame, d), df)
        finally:
            # Close the store even when an assertion fails so the handle is
            # not left open while the temporary file is cleaned up.
            d.parent.close()
示例#35
0
文件: hdfstore.py 项目: bopopescu/QC
def pre_compute(expr, data, **kwargs):
    """Convert ``data`` to a pandas DataFrame; ``expr`` is not used."""
    frame = into(pd.DataFrame, data, **kwargs)
    return frame
示例#36
0
def test_tryexcept_into(csv, sql):
    """Loading with an invalid quotechar raises ``psycopg2.NotSupportedError``."""
    table, bind = sql
    expected_error = psycopg2.NotSupportedError
    with pytest.raises(expected_error):
        # uses multi-byte character
        into(table, csv, quotechar="alpha", bind=bind)
示例#37
0
def test_outer_join():
    """Check inner, left and right joins compiled to SQL and run on SQLite.

    Two tables, ``left`` and ``right``, are created in a temporary SQLite
    database. Each join is compiled with ``post_compute=False``, executed
    on a raw connection, and its rows are compared as a set.
    """
    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')

    with tmpfile('db') as fn:
        uri = 'sqlite:///' + fn
        engine = resource(uri)

        _left = [(1, 'Alice', 100), (2, 'Bob', 200), (4, 'Dennis', 400)]

        left = resource(uri, 'left', dshape=L.dshape)
        into(left, _left)

        _right = [('NYC', 1), ('Boston', 1), ('LA', 3), ('Moscow', 4)]
        right = resource(uri, 'right', dshape=R.dshape)
        into(right, _right)

        conn = engine.connect()
        try:
            def joined_rows(how):
                # Compile the join to a query and return its rows as a set.
                query = compute(join(L, R, how=how),
                                {L: left, R: right},
                                post_compute=False)
                return set(map(tuple, conn.execute(query).fetchall()))

            assert joined_rows('inner') == {
                (1, 'Alice', 100, 'NYC'),
                (1, 'Alice', 100, 'Boston'),
                (4, 'Dennis', 400, 'Moscow'),
            }

            assert joined_rows('left') == {
                (1, 'Alice', 100, 'NYC'),
                (1, 'Alice', 100, 'Boston'),
                (2, 'Bob', 200, None),
                (4, 'Dennis', 400, 'Moscow'),
            }

            assert joined_rows('right') == {
                (1, 'Alice', 100, 'NYC'),
                (1, 'Alice', 100, 'Boston'),
                (3, None, None, 'LA'),
                (4, 'Dennis', 400, 'Moscow'),
            }

            # SQLAlchemy doesn't support full outer join, so how='outer' is
            # not exercised. It would be expected to return the union of the
            # left and right results above:
            #   (1, 'Alice', 100, 'NYC'), (1, 'Alice', 100, 'Boston'),
            #   (2, 'Bob', 200, None), (3, None, None, 'LA'),
            #   (4, 'Dennis', 400, 'Moscow')
        finally:
            # Release the connection even when an assertion fails so the
            # temporary database file can be removed.
            conn.close()
示例#38
0
def test_into_np_ndarray_column():
    """A filtered column converts to an ndarray as long as its computed form."""
    table = data(L, fields=['id', 'name', 'balance'])
    negative_names = table[table.balance < 0].name

    as_array = into(np.ndarray, negative_names)
    computed = list(compute(negative_names))
    assert len(computed) == len(as_array)
示例#39
0
文件: chunks.py 项目: mhlr/blaze
def compute_down(expr, data, **kwargs):
    """Compute ``expr`` over ``data`` as an Iterator when every step is Cheap.

    Raises ``MDNotImplementedError`` if any expression on the path from
    ``expr`` down to its first leaf is not a ``Cheap`` instance.
    """
    leaf = expr._leaves()[0]
    only_cheap = all(isinstance(node, Cheap) for node in path(expr, leaf))
    if not only_cheap:
        raise MDNotImplementedError()
    return compute(expr, {leaf: into(Iterator, data)}, **kwargs)
示例#40
0
def test_sequence():
    """A list converted to a ``Bag`` can be mapped over."""
    bag = into(Bag, [1, 2, 3])
    incremented = set(bag.map(inc))
    assert incremented == {2, 3, 4}
示例#41
0
    assert expr_repr(expr) == 'data[data.balance > a].name'


def test_isidentical_regr():
    """A field of NaN-filled data must be identical to itself."""
    # regression test for #1387
    nan_records = np.array(
        [(np.nan,), (np.nan,)],
        dtype=[('a', 'float64')],
    )
    wrapped = data(nan_records)
    assert wrapped.a.isidentical(wrapped.a)


@pytest.mark.parametrize(
    'data,dshape,exp_type',
    [
        # scalar input
        (1, symbol('x', 'int').dshape, int),
        # test 1-d to series
        (
            into(da.core.Array, [1, 2], chunks=(10, )),
            dshape('2 * int'),
            pd.Series,
        ),
        # test 2-d tabular to dataframe
        (
            into(da.core.Array,
                 [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}],
                 chunks=(10, 10)),
            dshape('2 * {a: int, b: int}'),
            pd.DataFrame,
        ),
        # test 2-d non tabular to ndarray
        (
            into(da.core.Array, [[1, 2], [3, 4]], chunks=(10, 10)),
            dshape('2 *  2 * int'),
            np.ndarray,
        ),
    ])
def test_coerce_core(data, dshape, exp_type):
    """``coerce_core`` returns an instance of the expected in-memory type."""
    coerced = coerce_core(data, dshape)
    assert isinstance(coerced, exp_type)
示例#42
0
def test_into():
    """Converting ``t`` to a list matches converting ``tdata`` to a list."""
    from_expr = into(list, t)
    from_raw = into(list, tdata)
    assert from_expr == from_raw
示例#43
0
def compute_up(t, x, **kwargs):
    """Convert array ``x`` to a pandas container and re-dispatch ``compute_up``.

    Multi-dimensional arrays, recarrays and arrays with named fields become
    a DataFrame; everything else becomes a Series. The conversion uses the
    datashape of ``t._child``.
    """
    ds = t._child.dshape
    is_tabular = (x.ndim > 1
                  or isinstance(x, np.recarray)
                  or x.dtype.fields is not None)
    container = DataFrame if is_tabular else Series
    return compute_up(t, into(container, x, dshape=ds), **kwargs)
示例#44
0
from datashape.predicates import isscalar, iscollection, isrecord
from blaze.expr import symbol, by
from blaze.interactive import Data
from blaze.compute import compute
from blaze.expr.functions import sin, exp

# Symbolic table whose datashape is reused for the SQL resource below.
t = symbol('t', 'var * {amount: int64, id: int64, name: string}')

L = [[100, 1, 'Alice'], [200, 2, 'Bob'], [300, 3, 'Charlie'], [-400, 4, 'Dan'],
     [500, 5, 'Edith']]

df = DataFrame(L, columns=['amount', 'id', 'name'])

x = into(np.ndarray, df)

# In-memory sources; optional backends are appended below when they load.
sources = [df, x]

try:
    import sqlalchemy
    sql = resource('sqlite:///:memory:::accounts', dshape=t.dshape)
    into(sql, L)
    sources.append(sql)
except Exception:
    # Best effort: the SQL source is skipped when it cannot be set up.
    # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
    # propagate.
    sql = None

try:
    import bcolz
    bc = into(bcolz.ctable, df)
    sources.append(bc)
示例#45
0
def compute_up(expr, data, **kwargs):
    """Compute on the ndarray form of ``data``; wrap the result in a Series.

    The Series is named ``expr._name``.
    """
    values = compute_up(expr, into(np.ndarray, data), **kwargs)
    return Series(values, name=expr._name)
示例#46
0
def df_eq(a, b):
    """Return whether ``a`` and ``b`` have equal columns and equal row sets.

    Column names are compared in order; rows are compared as sets, so row
    order and duplicates are ignored. Dtypes are not compared.
    """
    if list(a.columns) != list(b.columns):
        return False
    # (a dtype comparison was considered here but is intentionally disabled)
    rows_a = into(set, into(list, a))
    rows_b = into(set, into(list, b))
    return rows_a == rows_b
示例#47
0
def test_simple_into(csv, sql):
    """Load the CSV into the SQL table and compare the rows read back."""
    table, bind = sql
    schema = discover(table)
    into(table, csv, dshape=schema, bind=bind)

    rows = into(list, table, bind=bind)
    assert rows == data
示例#48
0
def test_into_curry():
    """``into`` with only a target returns a callable that does the conversion."""
    assert callable(into(list))

    values = (1, 2, 3)
    curried_result = into(list)(values)
    assert curried_result == odo(values, list)
示例#49
0
def test_no_header_no_columns(csv, sql):
    """Load the CSV using the table's discovered datashape, then compare rows."""
    table, bind = sql
    into(table, csv, bind=bind, dshape=discover(table))

    loaded = into(list, table, bind=bind)
    assert loaded == data
示例#50
0
def test_into_invalid_dshape(dshape):
    """``into`` raises ``TypeError`` when given the supplied ``dshape``."""
    values = (1, 2, 3)
    with pytest.raises(TypeError):
        into(list, values, dshape=dshape)
示例#51
0
def test_convert_local_file_to_temp_ssh_file():
    """Converting a local CSV to a temporary SSH CSV keeps the rows."""
    csv_text = 'name,balance\nAlice,100\nBob,200'
    with filetext(csv_text, extension='csv') as fn:
        local_csv = CSV(fn)
        remote_csv = convert(Temp(SSH(CSV)), local_csv, hostname='localhost')

        local_rows = into(list, local_csv)
        assert local_rows == into(list, remote_csv)
示例#52
0
def test_by():
    """Group by ``amount > 0`` and count ids in each group."""
    grouped = by(t.amount > 0, count=t.id.count())
    computed = compute(grouped, x)

    rows = set(map(tuple, into(list, computed)))
    assert rows == {(False, 2), (True, 3)}
示例#53
0
def test_append_other():
    """Write the ndarray form of ``df`` to an HDFStore and compare datashapes."""
    with tmpfile('.hdf5') as fn:
        x = into(np.ndarray, df)
        dset = into('hdfstore://' + fn + '::/data', x)
        try:
            assert discover(dset) == discover(df)
        finally:
            # Close the store even when the assertion fails so the handle is
            # not left open while the temporary file is cleaned up.
            dset.parent.close()
示例#54
0
def test_append_object_to_HDFS_foo():
    """Write a DataFrame to an HDFS JSON file and compare its ndarray form."""
    frame = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    with tmpfile_hdfs('json') as fn:
        target_uri = 'hdfs://%s:%s' % (host, fn)
        remote_json = into(target_uri, frame, hdfs=hdfs)

        matches = into(np.ndarray, remote_json) == into(np.ndarray, frame)
        assert matches.all()
示例#55
0
def test_copy_hdfs_data_into_memory():
    """The first HDFS accounts file converts to a non-empty list."""
    with accounts_data() as (d, (a, b, c)):
        rows = into(list, a)
        assert rows
示例#56
0
def test_into_nd_array_selection():
    """A row selection converts to an ndarray as long as its computed form."""
    table = data(L, fields=['id', 'name', 'balance'])
    negative = table[table['balance'] < 0]

    as_array = into(np.ndarray, negative)
    computed = list(compute(negative))
    assert len(computed) == len(as_array)
示例#57
0
文件: test_csv.py 项目: user32000/odo
def test_csv_into_list():
    """A CSV filename converts directly to a list of row tuples."""
    csv_text = 'name,val\nAlice,100\nBob,200'
    with filetext(csv_text, extension='csv') as fn:
        rows = into(list, fn)
        assert rows == [('Alice', 100), ('Bob', 200)]
示例#58
0
def test_computation_directly_on_sqlalchemy_Tables(data):
    """Computing on the ``name`` table yields a plain, empty list of rows."""
    table = data['name']
    sym = symbol('s', discover(table))

    rows = into(list, compute(sym.id + 1, table))
    assert not isinstance(rows, sa.sql.Selectable)
    assert list(rows) == []
示例#59
0
def test_into_respects_expected_len_during_append():
    """A bcolz container built by ``into`` reports the length it was given."""
    values = [1, 2, 3]
    with tmpfile('.bcolz') as fn:
        container = into(fn, values)
        assert get_expectedlen(container) == 3
        assert len(container) == 3
        # Remove the directory the container was written to.
        shutil.rmtree(fn)
示例#60
0
def test_into_nd_array_column_failure():
    """A filtered table converts to an ndarray as long as its computed form."""
    table = data(L, fields=['id', 'name', 'balance'])
    negative = table[table['balance'] < 0]

    as_array = into(np.ndarray, negative)
    computed_rows = list(compute(negative))
    assert len(computed_rows) == len(as_array)