def plot(self, output_file="termite.html"):
    import blaze as blz
    from odo import into
    import pandas as pd
    import bokeh.plotting as plt
    from bokeh.models.sources import ColumnDataSource
    import logging

    t = blz.Data(self.input_file)

    MAX = blz.compute(t.weight.max())
    MIN = blz.compute(t.weight.min())

    # Create a size variable to define the size of the circle for the plot.
    t = blz.transform(t, size=blz.sqrt((t.weight - MIN) / (MAX - MIN)) * 50)

    WORDS = t['word'].distinct()
    WORDS = into(list, WORDS)
    topics = t['topic'].distinct()
    topics = into(list, topics)
    # Convert topics to strings
    TOPICS = [str(i) for i in topics]

    source = into(pd.DataFrame, t)

    plt.output_file(output_file)
    data_source = ColumnDataSource(source)

    p = plt.figure(x_range=TOPICS, y_range=WORDS,
                   plot_width=1000, plot_height=1700,
                   title=self.title)
    p.circle(x="topic", y="word", size="size", fill_alpha=0.6,
             source=data_source)
    # p.xaxis.major_label_orientation = np.pi / 3

    logging.info("generating termite plot for file %s" % self.input_file)
    plt.show(p)
def concrete_head(expr, n=10):
    """Return the head of a computed expression."""
    if not expr._resources():
        raise ValueError("Expression does not contain data resources")
    if not iscollection(expr.dshape):
        return compute(expr)

    head = expr.head(n + 1)

    if isrecord(expr.dshape.measure):
        return into(DataFrame, head)
    else:
        df = into(DataFrame, head)
        df.columns = [expr._name]
        return df
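# Usage sketch, not from the original source: `concrete_head` applied to an
# interactive blaze `Data` object backed by a plain list. Assumes blaze and
# pandas are installed; the field names here are illustrative.
from blaze import Data

accounts = Data([(1, 'Alice', 100), (2, 'Bob', 200)],
                dshape='var * {id: int64, name: string, amount: int64}')
print(concrete_head(accounts, n=5))  # pandas DataFrame of the leading rows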
def test_simple_into(tbl):
    csv = CSV(file_name)
    sql = resource(url, tbl, dshape=ds)

    into(sql, csv, dshape=ds)

    assert into(list, sql) == data


def test_no_header_no_columns(tbl):
    csv = CSV(file_name)
    sql = resource(url, tbl, dshape=ds)

    into(sql, csv, dshape=ds)

    assert into(list, sql) == data


def test_complex_into(complex_csv, complex_sql):
    complex_sql, bind = complex_sql
    # data from: http://dummydata.me/generate
    into(complex_sql, complex_csv, dshape=discover(complex_sql), bind=bind)
    assert_allclose(
        into(list, complex_sql, bind=bind),
        into(list, complex_csv),
    )
def plot(self, output_file="termite.html"):
    import blaze as blz
    from odo import into
    import pandas as pd
    import bokeh.plotting as plt
    from bokeh.models.sources import ColumnDataSource

    t = blz.Data(self.input_file)

    MAX = blz.compute(t.weight.max())
    MIN = blz.compute(t.weight.min())

    # Create a size variable to define the size of the circle for the plot.
    t = blz.transform(t, size=blz.sqrt((t.weight - MIN) / (MAX - MIN)) * 50)

    WORDS = t['word'].distinct()
    WORDS = into(list, WORDS)
    topics = t['topic'].distinct()
    topics = into(list, topics)
    # Convert topics to strings
    TOPICS = [str(i) for i in topics]

    source = into(pd.DataFrame, t)

    plt.output_file(output_file)
    data_source = ColumnDataSource(source)

    p = plt.figure(x_range=TOPICS, y_range=WORDS,
                   plot_width=1000, plot_height=1700,
                   title=self.title)
    p.circle(x="topic", y="word", size="size", fill_alpha=0.6,
             source=data_source)
    plt.show(p)
def test_csv_with_header():
    with tmpfile('db') as dbfilename:
        with filetext('a,b\n1,2\n3,4', extension='csv') as csvfilename:
            t = into('sqlite:///%s::mytable' % dbfilename,
                     csvfilename, has_header=True)
            assert discover(t) == dshape('var * {a: int64, b: int64}')
            assert into(set, t) == set([(1, 2), (3, 4)])


def sql():
    data = [(1, 2), (10, 20), (100, 200)]
    sql = resource(
        'sqlite:///:memory:', 'foo',
        dshape='var * {x: int, y: int}',
    )
    into(sql, data)
    return sql


def test_s3_to_ssh():
    pytest.importorskip('boto')

    tips_uri = 's3://nyqpug/tips.csv'
    with tmpfile('.csv') as fn:
        result = into(Temp(SSH(CSV))(fn, hostname='localhost'), tips_uri)
        assert into(list, result) == into(list, tips_uri)
        assert discover(result) == discover(resource(tips_uri))


def test_copy_local_files_to_hdfs():
    with tmpfile_hdfs() as target:
        with filetext('name,amount\nAlice,100\nBob,200') as source:
            csv = CSV(source)
            scsv = HDFS(CSV)(target, hdfs=hdfs)
            into(scsv, csv, blocksize=10)  # 10 bytes per message

            assert discover(scsv) == discover(csv)


def sql():
    data = [(1, 2), (10, 20), (100, 200)]
    sql = bz_data(
        'sqlite:///:memory:::foo',
        dshape='var * {x: int, y: int}',
    )
    into(sql, data)
    return sql
def test_csv_to_s3_into():
    df = tm.makeMixedDataFrame()
    with tmpfile('.csv') as fn:
        with s3_bucket('.csv') as b:
            df.to_csv(fn, index=False)
            s3 = into(b, CSV(fn))
            result = into(pd.DataFrame, s3)
            tm.assert_frame_equal(df, result)


def test_csv_to_s3__using_multipart_upload():
    df = pd.DataFrame({'a': ["*" * 5 * 1024 ** 2]})
    with tmpfile('.csv') as fn:
        with s3_bucket('.csv') as b:
            df.to_csv(fn, index=False)
            s3 = into(b, CSV(fn), multipart=True)
            result = into(pd.DataFrame, s3)
            tm.assert_frame_equal(df, result)


def test_append_other():
    with tmpfile('.hdf5') as fn:
        x = into(np.ndarray, df)
        dset = into('hdfstore://' + fn + '::/data', x)
        try:
            assert discover(dset) == discover(df)
        finally:
            dset.parent.close()


def test_month():
    dts = [datetime(2000, 7, 1), datetime(2000, 6, 30),
           datetime(2000, 6, 1), datetime(2000, 5, 31)]
    dts = into(np.ndarray, dts)
    assert eq(
        compute(s.truncate(1, "month"), dts),
        into(np.ndarray, [date(2000, 7, 1), date(2000, 6, 1),
                          date(2000, 6, 1), date(2000, 5, 1)]),
    )


def test_into_resource():
    with tmpfile('.hdf5') as fn:
        d = into('hdfstore://' + fn + '::/x', df)
        try:
            assert discover(d) == discover(df)
            assert eq(into(pd.DataFrame, d), df)
        finally:
            d.parent.close()
def test_varlen_dtypes():
    y = np.array([('Alice', 100), ('Bob', 200)],
                 dtype=[('name', 'O'), ('amount', 'i4')])
    with tmpfile('.hdf5') as fn:
        dset = into(fn + '::/data', y)
        try:
            # Round-tripping through the dataset should preserve the data.
            assert into(list, dset) == into(list, y)
        finally:
            dset.file.close()
def test_table_resource():
    with tmpfile('csv') as filename:
        ds = dshape('var * {a: int, b: int}')
        csv = CSV(filename)
        append(csv, [[1, 2], [10, 20]], dshape=ds)

        t = Data(filename)
        assert isinstance(t.data, CSV)
        assert into(list, compute(t)) == into(list, csv)
def eq(a, b):
    # Normalize DataFrames to ndarrays so the comparison below works
    # uniformly, then collapse elementwise results to a single boolean.
    if isinstance(a, pd.DataFrame):
        a = into(np.ndarray, a)
    if isinstance(b, pd.DataFrame):
        b = into(np.ndarray, b)
    c = a == b
    if isinstance(c, np.ndarray):
        c = c.all()
    return c
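# Illustration only (not from the original source): `eq` gives a single
# boolean for scalar, ndarray, or DataFrame comparisons alike.
import numpy as np
import pandas as pd

assert eq(3, 3)
assert eq(np.array([1, 2]), np.array([1, 2]))
assert eq(pd.DataFrame({'a': [1, 2]}), pd.DataFrame({'a': [1, 2]}))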
def test_datetimes():
    from odo import into
    import numpy as np

    data = [{'a': 1, 'dt': datetime.datetime(2001, 1, 1)},
            {'a': 2, 'dt': datetime.datetime(2002, 2, 2)}]
    with tmpfile('json') as fn:
        j = JSONLines(fn)
        append(j, data)

        assert str(into(np.ndarray, j)) == str(into(np.ndarray, data))


def test_pandas_csv_naive_behavior_results_in_columns():
    df = pd.DataFrame(
        [[1, "Alice", 100],
         [2, "Bob", -200],
         [3, "Charlie", 300],
         [4, "Denis", 400],
         [5, "Edith", -500]],
        columns=["id", "name", "amount"],
    )
    with tmpfile(".csv") as fn:
        into(fn, df)

        with open(fn) as f:
            assert next(f).strip() == "id,name,amount"


def test_temp_csv():
    csv = into(Temp(CSV)('_test_temp_csv.csv'), df)
    assert isinstance(csv, CSV)
    assert into(list, csv) == into(list, df)

    del csv
    import gc
    gc.collect()
    assert not os.path.exists('_test_temp_csv.csv')


def test_ssh_csv_to_s3_csv():
    # for some reason this can only be run in the same file as other ssh
    # tests and must be a Temp(SSH(CSV)), otherwise the tests above this
    # one fail
    s3_bucket = pytest.importorskip('odo.backends.tests.test_aws').s3_bucket

    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        remote = into(Temp(SSH(CSV)), CSV(fn), hostname='localhost')
        with s3_bucket('.csv') as b:
            result = into(b, remote)
            assert discover(result) == discover(resource(b))


def test_data_on_iterator_refies_data():
    data = [1, 2, 3]
    d = Data(iter(data))

    assert into(list, d) == data
    assert into(list, d) == data

    # in context
    with Data(iter(data)) as d:
        assert d is not None
def test_into_sqlite():
    data = [('Alice', 100), ('Bob', 200)]
    ds = datashape.dshape('var * {name: string, amount: int}')

    with tmpfile('.db') as dbpath:
        with tmpfile('.csv') as csvpath:
            csv = into(csvpath, data, dshape=ds, has_header=False)
            sql = resource('sqlite:///%s::mytable' % dbpath, dshape=ds)
            with ignoring(NotImplementedError):
                append_csv_to_sql_table(sql, csv)
                assert into(list, sql) == data
def test_into_sqlite_with_header_and_different_sep():
    df = pd.DataFrame([('Alice', 100), ('Bob', 200)],
                      columns=['name', 'amount'])
    with tmpfile('.csv') as fn:
        csv = into(fn, df, delimiter='|')
        with tmpfile('.db') as sql:
            db = resource('sqlite:///%s::df' % sql, dshape=discover(csv))
            result = into(db, csv)
            assert into(list, result) == into(list, df)
def test_hdfs_directory_hive_creation():
    with accounts_data() as (hdfs_directory, (a, b, c)):
        with hive_table(host) as uri:
            t = into(uri, hdfs_directory)
            assert isinstance(t, sa.Table)

            result = into(set, t)
            assert len(result) > 0
            assert discover(t) == ds

            t2 = into(uri, c)  # append new singleton file
            assert len(into(list, t2)) > len(result)
def test_hour():
    dts = [datetime(2000, 6, 20, 1, 00, 00),
           datetime(2000, 6, 20, 12, 59, 59),
           datetime(2000, 6, 20, 12, 00, 00),
           datetime(2000, 6, 20, 11, 59, 59)]
    dts = into(np.ndarray, dts)

    assert eq(compute(s.truncate(1, 'hour'), dts),
              into(np.ndarray, [datetime(2000, 6, 20, 1, 0),
                                datetime(2000, 6, 20, 12, 0),
                                datetime(2000, 6, 20, 12, 0),
                                datetime(2000, 6, 20, 11, 0)]))


def test_append_to_array():
    x = np.arange(600).reshape((20, 30))
    a = into(Array, x, blockshape=(4, 5))
    b = bcolz.zeros(shape=(0, 30), dtype=x.dtype)

    append(b, a)
    assert eq(b[:], x)

    with tmpfile('hdf5') as fn:
        h = into(fn + '::/data', a)
        assert eq(h[:], x)
        h.file.close()


def test_scalar_sql_compute():
    t = into('sqlite:///:memory:::t', tdata,
             dshape=dshape('var * {name: string, amount: int}'))
    d = data(t)
    assert expr_repr(d.amount.sum()) == '300'
def pre_compute(expr, data, **kwargs):
    # Materialize the data as a container of DataFrame chunks so large
    # results can be processed one piece at a time.
    return into(chunks(pd.DataFrame), data, **kwargs)
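# A minimal sketch (not from the original source) of the container the
# chunked pre_compute above returns: odo's chunks(pd.DataFrame) wraps an
# iterable of DataFrame pieces that can be streamed over one at a time.
import pandas as pd
from odo import chunks

pieces = chunks(pd.DataFrame)([pd.DataFrame({'x': [1, 2]}),
                               pd.DataFrame({'x': [10, 20]})])
assert sum(chunk['x'].sum() for chunk in pieces) == 33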
def test_copy_hdfs_files_locally():
    with tmpfile('csv') as target:
        with accounts_data() as (d, (a, b, c)):
            csv = into(target, a)
            with open(csv.path) as f:
                assert f.read().strip() == accounts_1_csv


def test_into_resource():
    with tmpfile('.hdf5') as fn:
        d = into('hdfstore://' + fn + '::/x', df)
        assert discover(d) == discover(df)
        assert eq(into(pd.DataFrame, d), df)
        d.parent.close()


def pre_compute(expr, data, **kwargs):
    return into(pd.DataFrame, data, **kwargs)
def test_tryexcept_into(csv, sql):
    sql, bind = sql
    with pytest.raises(psycopg2.NotSupportedError):
        # a multi-character quotechar is not supported
        into(sql, csv, quotechar="alpha", bind=bind)
def test_outer_join():
    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')

    with tmpfile('db') as fn:
        uri = 'sqlite:///' + fn
        engine = resource(uri)

        _left = [(1, 'Alice', 100),
                 (2, 'Bob', 200),
                 (4, 'Dennis', 400)]
        left = resource(uri, 'left', dshape=L.dshape)
        into(left, _left)

        _right = [('NYC', 1),
                  ('Boston', 1),
                  ('LA', 3),
                  ('Moscow', 4)]
        right = resource(uri, 'right', dshape=R.dshape)
        into(right, _right)

        conn = engine.connect()

        query = compute(join(L, R, how='inner'),
                        {L: left, R: right},
                        post_compute=False)
        result = list(map(tuple, conn.execute(query).fetchall()))
        assert set(result) == set([(1, 'Alice', 100, 'NYC'),
                                   (1, 'Alice', 100, 'Boston'),
                                   (4, 'Dennis', 400, 'Moscow')])

        query = compute(join(L, R, how='left'),
                        {L: left, R: right},
                        post_compute=False)
        result = list(map(tuple, conn.execute(query).fetchall()))
        assert set(result) == set([(1, 'Alice', 100, 'NYC'),
                                   (1, 'Alice', 100, 'Boston'),
                                   (2, 'Bob', 200, None),
                                   (4, 'Dennis', 400, 'Moscow')])

        query = compute(join(L, R, how='right'),
                        {L: left, R: right},
                        post_compute=False)
        result = list(map(tuple, conn.execute(query).fetchall()))
        assert set(result) == set([(1, 'Alice', 100, 'NYC'),
                                   (1, 'Alice', 100, 'Boston'),
                                   (3, None, None, 'LA'),
                                   (4, 'Dennis', 400, 'Moscow')])

        # SQLAlchemy doesn't support full outer join
        """
        query = compute(join(L, R, how='outer'),
                        {L: left, R: right},
                        post_compute=False)
        result = list(map(tuple, conn.execute(query).fetchall()))
        assert set(result) == set([(1, 'Alice', 100, 'NYC'),
                                   (1, 'Alice', 100, 'Boston'),
                                   (2, 'Bob', 200, None),
                                   (3, None, None, 'LA'),
                                   (4, 'Dennis', 400, 'Moscow')])
        """

        conn.close()
def test_into_np_ndarray_column():
    t = data(L, fields=['id', 'name', 'balance'])
    expr = t[t.balance < 0].name
    colarray = into(np.ndarray, expr)
    assert len(list(compute(expr))) == len(colarray)
def compute_down(expr, data, **kwargs):
    leaf = expr._leaves()[0]
    # Only stream the data through an iterator when every step along the
    # expression path is cheap; otherwise defer to another backend.
    if all(isinstance(e, Cheap) for e in path(expr, leaf)):
        return compute(expr, {leaf: into(Iterator, data)}, **kwargs)
    else:
        raise MDNotImplementedError()
def test_sequence():
    b = into(Bag, [1, 2, 3])
    assert set(b.map(inc)) == set([2, 3, 4])
def test_isidentical_regr():
    # regression test for #1387
    tdata = np.array([(np.nan,), (np.nan,)], dtype=[('a', 'float64')])
    ds = data(tdata)
    assert ds.a.isidentical(ds.a)


@pytest.mark.parametrize(
    'data,dshape,exp_type',
    [
        (1, symbol('x', 'int').dshape, int),
        # test 1-d to series
        (into(da.core.Array, [1, 2], chunks=(10,)),
         dshape('2 * int'),
         pd.Series),
        # test 2-d tabular to dataframe
        (into(da.core.Array,
              [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}],
              chunks=(10, 10)),
         dshape('2 * {a: int, b: int}'),
         pd.DataFrame),
        # test 2-d non tabular to ndarray
        (into(da.core.Array, [[1, 2], [3, 4]], chunks=(10, 10)),
         dshape('2 * 2 * int'),
         np.ndarray),
    ],
)
def test_coerce_core(data, dshape, exp_type):
    assert isinstance(coerce_core(data, dshape), exp_type)
def test_into():
    assert into(list, t) == into(list, tdata)
def compute_up(t, x, **kwargs):
    ds = t._child.dshape
    # Tabular or multidimensional data becomes a DataFrame; 1-d homogeneous
    # data becomes a Series.
    if x.ndim > 1 or isinstance(x, np.recarray) or x.dtype.fields is not None:
        return compute_up(t, into(DataFrame, x, dshape=ds), **kwargs)
    else:
        return compute_up(t, into(Series, x, dshape=ds), **kwargs)
from datashape.predicates import isscalar, iscollection, isrecord
from blaze.expr import symbol, by
from blaze.interactive import Data
from blaze.compute import compute
from blaze.expr.functions import sin, exp

t = symbol('t', 'var * {amount: int64, id: int64, name: string}')

L = [[100, 1, 'Alice'],
     [200, 2, 'Bob'],
     [300, 3, 'Charlie'],
     [-400, 4, 'Dan'],
     [500, 5, 'Edith']]

df = DataFrame(L, columns=['amount', 'id', 'name'])

x = into(np.ndarray, df)

sources = [df, x]

try:
    import sqlalchemy
    sql = resource('sqlite:///:memory:::accounts', dshape=t.dshape)
    into(sql, L)
    sources.append(sql)
except Exception:
    sql = None

try:
    import bcolz
    bc = into(bcolz.ctable, df)
    sources.append(bc)
except Exception:
    bc = None  # bcolz unavailable; mirror the sqlalchemy fallback above
def compute_up(expr, data, **kwargs):
    return Series(compute_up(expr, into(np.ndarray, data), **kwargs),
                  name=expr._name)


def df_eq(a, b):
    return (list(a.columns) == list(b.columns)
            # and list(a.dtypes) == list(b.dtypes)
            and into(set, into(list, a)) == into(set, into(list, b)))


def test_simple_into(csv, sql):
    sql, bind = sql
    into(sql, csv, dshape=discover(sql), bind=bind)
    assert into(list, sql, bind=bind) == data
def test_into_curry():
    assert callable(into(list))
    data = (1, 2, 3)
    assert into(list)(data) == odo(data, list)
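# Illustration grounded in the test above: called with only a target type,
# `into` curries and returns a reusable converter.
from odo import into

to_list = into(list)            # target fixed; data supplied later
assert to_list((1, 2, 3)) == [1, 2, 3]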
def test_no_header_no_columns(csv, sql):
    sql, bind = sql
    into(sql, csv, bind=bind, dshape=discover(sql))
    assert into(list, sql, bind=bind) == data


def test_into_invalid_dshape(dshape):
    with pytest.raises(TypeError):
        into(list, (1, 2, 3), dshape=dshape)


def test_convert_local_file_to_temp_ssh_file():
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        csv = CSV(fn)
        scsv = convert(Temp(SSH(CSV)), csv, hostname='localhost')

        assert into(list, csv) == into(list, scsv)


def test_by():
    expr = by(t.amount > 0, count=t.id.count())
    result = compute(expr, x)
    assert set(map(tuple, into(list, result))) == set([(False, 2),
                                                       (True, 3)])


def test_append_other():
    with tmpfile('.hdf5') as fn:
        x = into(np.ndarray, df)
        dset = into('hdfstore://' + fn + '::/data', x)
        assert discover(dset) == discover(df)
        dset.parent.close()


def test_append_object_to_HDFS_foo():
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    with tmpfile_hdfs('json') as fn:
        js = into('hdfs://%s:%s' % (host, fn), df, hdfs=hdfs)
        assert (into(np.ndarray, js) == into(np.ndarray, df)).all()


def test_copy_hdfs_data_into_memory():
    with accounts_data() as (d, (a, b, c)):
        assert into(list, a)


def test_into_nd_array_selection():
    t = data(L, fields=['id', 'name', 'balance'])
    expr = t[t['balance'] < 0]
    selarray = into(np.ndarray, expr)
    assert len(list(compute(expr))) == len(selarray)


def test_csv_into_list():
    with filetext('name,val\nAlice,100\nBob,200', extension='csv') as fn:
        L = into(list, fn)
        assert L == [('Alice', 100), ('Bob', 200)]


def test_computation_directly_on_sqlalchemy_Tables(data):
    name = data['name']
    s = symbol('s', discover(name))
    result = into(list, compute(s.id + 1, name))
    assert not isinstance(result, sa.sql.Selectable)
    assert list(result) == []
def test_into_respects_expected_len_during_append():
    with tmpfile('.bcolz') as fn:
        b = into(fn, [1, 2, 3])
        assert get_expectedlen(b) == 3
        assert len(b) == 3
        shutil.rmtree(fn)
def test_into_nd_array_column_failure():
    tble = data(L, fields=['id', 'name', 'balance'])
    expr = tble[tble['balance'] < 0]
    colarray = into(np.ndarray, expr)
    assert len(list(compute(expr))) == len(colarray)