def test_pandas_discover_on_gzipped_files():
    """Schema discovery should see through gzip compression to the CSV."""
    text = 'name,when\nAlice,2014-01-01\nBob,2014-02-02'
    with filetext(text, open=gzip.open, extension='.csv.gz') as fn:
        csv = CSV(fn, has_header=True)
        expected = datashape.dshape('var * {name: string, when: datetime}')
        assert discover(csv) == expected
def test_csv():
    """The delimiter passed to CSV should be recorded in its dialect."""
    with tmpfile('.csv') as fn:
        csv = CSV(fn,
                  dshape='var * {name: string, amount: int}',
                  delimiter=',')
        assert csv.dialect['delimiter'] == ','
def test_pandas_read_supports_read_csv_kwargs():
    """Extra keyword args (e.g. usecols) should be forwarded to read_csv."""
    with filetext('Alice,1\nBob,2') as fn:
        shape = datashape.dshape('var * {name: string, amount: int}')
        frame = csv_to_DataFrame(CSV(fn), dshape=shape, usecols=['name'])
        assert isinstance(frame, pd.DataFrame)
        # Only the selected column should survive.
        assert convert(list, frame) == [('Alice', ), ('Bob', )]
def test_temp_ssh_files():
    """A CSV pushed to Temp(SSH(CSV)) keeps its datashape and is temporary."""
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        local_csv = CSV(fn)
        remote_csv = into(Temp(SSH(CSV)), local_csv, hostname='localhost')
        assert discover(local_csv) == discover(remote_csv)
        assert isinstance(remote_csv, _Temp)
def test_simple_into(tbl):
    """Loading a CSV into a fresh SQL table should reproduce the data."""
    source = CSV(file_name)
    target = resource(url, tbl, dshape=ds)
    into(target, source, dshape=ds)
    assert into(list, target) == data
def test_no_header_no_columns(tbl):
    """A headerless CSV should still load correctly given an explicit dshape."""
    source = CSV(file_name)
    target = resource(url, tbl, dshape=ds)
    into(target, source, dshape=ds)
    assert into(list, target) == data
def test_pandas_loads_in_datetimes_naively():
    """Datetime columns should be discovered and parsed as naive M8[ns]."""
    with filetext('name,when\nAlice,2014-01-01\nBob,2014-02-02') as fn:
        csv = CSV(fn, has_header=True)
        expected = datashape.dshape('var * {name: ?string, when: ?datetime}')
        assert discover(csv) == expected
        frame = convert(pd.DataFrame, csv)
        assert frame.dtypes['when'] == 'M8[ns]'
def test_pandas_read_supports_missing_integers():
    """A nullable int column with missing values should load as float32."""
    with filetext('Alice,1\nBob,') as fn:
        shape = datashape.dshape('var * {name: string, val: ?int32}')
        frame = csv_to_DataFrame(CSV(fn), dshape=shape)
        assert isinstance(frame, pd.DataFrame)
        assert list(frame.columns) == ['name', 'val']
        # Missing integers force a float dtype to represent NaN.
        assert frame.dtypes['val'] == 'f4'
def test_pandas_read():
    """csv_to_DataFrame should honor the dshape's column names and types."""
    with filetext('Alice,1\nBob,2') as fn:
        shape = datashape.dshape('var * {name: string, amount: int}')
        frame = csv_to_DataFrame(CSV(fn), dshape=shape)
        assert isinstance(frame, pd.DataFrame)
        assert convert(list, frame) == [('Alice', 1), ('Bob', 2)]
        assert list(frame.columns) == ['name', 'amount']
def test_unused_datetime_columns():
    """Selecting only non-datetime columns must not trip datetime parsing."""
    shape = datashape.dshape('var * {val: string, when: datetime}')
    with filetext("val,when\na,2000-01-01\nb,2000-02-02") as fn:
        csv = CSV(fn, has_header=True)
        frame = csv_to_DataFrame(csv, usecols=['val'], squeeze=True,
                                 dshape=shape)
        assert convert(list, frame) == ['a', 'b']
def test_pandas_read_supports_datetimes():
    """Columns typed as date in the dshape should be parsed to M8[ns]."""
    with filetext('Alice,2014-01-02\nBob,2014-01-03') as fn:
        shape = datashape.dshape('var * {name: string, when: date}')
        frame = csv_to_DataFrame(CSV(fn), dshape=shape)
        assert isinstance(frame, pd.DataFrame)
        assert list(frame.columns) == ['name', 'when']
        assert frame.dtypes['when'] == 'M8[ns]'
def test_pandas_writes_header_by_default():
    """Appending records to a fresh CSV should emit a header row."""
    with tmpfile('.csv') as fn:
        shape = datashape.dshape('var * {name: string, amount: int}')
        records = [('Alice', 1), ('Bob', 2)]
        append(CSV(fn), records, dshape=shape)
        with open(fn) as f:
            assert 'name' in f.read()
def test_no_header_no_columns():
    """A headerless CSV loads into SQL using its discovered dshape."""
    tbl = 'testtable_into_2'
    source = CSV(file_name)
    target = resource(url + '::' + tbl, dshape=source.dshape)
    into(target, source, if_exists="replace")
    assert into(list, target) == [(1, 2), (10, 20), (100, 200)]
def test_failing_argument():
    """Exercise into() with a bogus keyword argument.

    NOTE(review): there is no assertion here and the trailing comment says
    "failing call" — presumably this only checks that the call path copes
    with the bad kwarg; confirm whether an exception is expected.
    """
    tbl = 'testtable_into_2'
    source = CSV(file_name, columns=['a', 'b'])
    target = resource(url + '::' + tbl, dshape=source.dshape)
    into(target, source, if_exists="replace",
         skipinitialspace="alpha")  # failing call
def test_table_resource():
    """Data() over a CSV path should wrap a CSV and compute the same rows."""
    with tmpfile('csv') as filename:
        shape = dshape('var * {a: int, b: int}')
        csv = CSV(filename)
        append(csv, [[1, 2], [10, 20]], dshape=shape)
        table = Data(filename)
        assert isinstance(table.data, CSV)
        assert into(list, compute(table)) == into(list, csv)
def test_append(tbl):
    """Loading the same CSV twice should append, not replace."""
    source = CSV(file_name)
    target = resource(url, tbl, dshape=ds)
    into(target, source)
    assert into(list, target) == data
    # Second load appends a duplicate copy of every row.
    into(target, source)
    assert into(list, target) == data + data
def test_header_disagrees_with_dshape():
    """An explicit dshape should override column names from the header.

    Without a dshape the header names win; with one, its names win.
    """
    ds = datashape.dshape('var * {name: string, bal: int64}')
    with filetext('name,val\nAlice,100\nBob,200', extension='csv') as fn:
        # Fixed: was `header=True`, but the CSV constructor's flag is
        # `has_header` everywhere else in this file — the typo'd keyword
        # would not mark the first row as a header.
        csv = CSV(fn, has_header=True)
        assert convert(list, csv) == [('Alice', 100), ('Bob', 200)]
        assert list(convert(pd.DataFrame, csv).columns) == ['name', 'val']
        assert list(convert(pd.DataFrame, csv, dshape=ds).columns) == \
            ['name', 'bal']
def test_simple_float_into():
    """Float-valued CSV data should round-trip through a SQL table."""
    tbl = 'testtable_into_float'
    source = CSV(file_name_floats, columns=['a', 'b'])
    target = resource(url + '::' + tbl, dshape=source.dshape)
    into(target, source, if_exists="replace")
    expected = [(1.02, 2.02), (102.02, 202.02), (1002.02, 2002.02)]
    assert into(list, target) == expected
def test_simple_into():
    """Integer CSV data should round-trip through a SQL table."""
    tbl = 'testtable_into_2'
    source = CSV(file_name, columns=['a', 'b'])
    target = resource(url + '::' + tbl, dshape=source.dshape)
    into(target, source, if_exists="replace")
    assert into(list, target) == [(1, 2), (10, 20), (100, 200)]
def test_pandas_read_supports_gzip():
    """csv_to_DataFrame should transparently read gzip-compressed CSVs."""
    with filetext('Alice,1\nBob,2', open=gzip.open, mode='wt',
                  extension='.csv.gz') as fn:
        shape = datashape.dshape('var * {name: string, amount: int}')
        frame = csv_to_DataFrame(CSV(fn), dshape=shape)
        assert isinstance(frame, pd.DataFrame)
        assert convert(list, frame) == [('Alice', 1), ('Bob', 2)]
        assert list(frame.columns) == ['name', 'amount']
def test_tryexcept_into():
    """Bad bulk-load options should fall back to the slow path and still load."""
    tbl = 'testtable_into_2'
    source = CSV(file_name, columns=['a', 'b'])
    target = resource(url + '::' + tbl, dshape=source.dshape)
    # uses multi-byte character and
    # fails over to using sql.extend()
    into(target, source, if_exists="replace", QUOTE="alpha", FORMAT="csv")
    assert into(list, target) == [(1, 2), (10, 20), (100, 200)]
def test_pandas_write_gzip():
    """append() should write gzip-compressed CSV output including a header.

    Fixed: ``gzip.open(fn)`` yields *bytes* on Python 3, so the original
    ``'name' in s`` membership tests against a str needle would raise
    TypeError.  Decode the payload first, and use a context manager so
    the file handle is closed even if an assertion fails.
    """
    with tmpfile('.csv.gz') as fn:
        ds = datashape.dshape('var * {name: string, amount: int}')
        data = [('Alice', 1), ('Bob', 2)]
        csv = CSV(fn, has_header=True)
        append(csv, data, dshape=ds)
        with gzip.open(fn) as f:
            s = f.read().decode('utf-8')
        assert 'name' in s
        assert 'Alice,1' in s
def test_convert_through_temporary_local_storage():
    """Round-trip CSV/DataFrame data through temporary SSH-backed files."""
    with filetext('name,quantity\nAlice,100\nBob,200', extension='csv') as fn:
        csv = CSV(fn)
        frame = into(pd.DataFrame, csv)

        # CSV -> remote temp CSV preserves the rows.
        remote_csv = into(Temp(SSH(CSV)), csv, hostname='localhost')
        assert into(list, csv) == into(list, remote_csv)

        # DataFrame -> remote temp CSV preserves the rows.
        remote_csv2 = into(Temp(SSH(CSV)), frame, hostname='localhost')
        assert into(list, remote_csv2) == into(list, frame)

        # DataFrame -> remote temp JSONLines preserves the values.
        remote_json = into(Temp(SSH(JSONLines)), frame, hostname='localhost')
        assert (into(np.ndarray, remote_json) == into(np.ndarray, frame)).all()
def test_csv_append():
    """Appending tuples to a headerless CSV writes them verbatim."""
    with tmpfile('.csv') as fn:
        csv = CSV(fn, has_header=False)
        rows = [('Alice', 100), ('Bob', 200)]
        append(csv, rows)
        assert list(convert(Iterator, csv)) == rows
        with open(fn) as f:
            contents = f.read()
        assert 'Alice' in contents
        assert '100' in contents
def test_pandas_write():
    """append() writes a header once, and never duplicates it on re-append."""
    with tmpfile('.csv') as fn:
        shape = datashape.dshape('var * {name: string, amount: int}')
        rows = [('Alice', 1), ('Bob', 2)]
        csv = CSV(fn, has_header=True)
        append(csv, rows, dshape=shape)
        with open(fn) as f:
            assert 'name' in f.read()

        # Doesn't write header twice
        append(csv, rows, dshape=shape)
        with open(fn) as f:
            contents = f.read()
        assert contents.count('name') == 1
def test_complex_into(tbl):
    """Load a mixed-type CSV (strings, dates, ints, floats) into SQL."""
    # data from: http://dummydata.me/generate
    this_dir = os.path.dirname(__file__)
    file_name = os.path.join(this_dir, 'dummydata.csv')
    ds = dshape("""
    var * {
        Name: string,
        RegistrationDate: date,
        ZipCode: int32,
        Consts: float64
    }""")
    source = CSV(file_name, has_header=True)
    target = resource(url, tbl, dshape=ds)
    into(target, source)
    # Floats compared with tolerance rather than exact equality.
    assert_allclose(into(list, target), into(list, source))
def test_complex_into():
    """Load a mixed-type CSV into SQL via an explicit record schema.

    Fixed: removed a dead ``df = pd.read_csv(...)`` local — the frame was
    never referenced, so the extra file parse was pure wasted work.
    """
    # data from: http://dummydata.me/generate
    this_dir = os.path.dirname(__file__)
    file_name = os.path.join(this_dir, 'dummydata.csv')
    tbl = 'testtable_into_complex'
    csv = CSV(
        file_name,
        schema=
        '{Name: string, RegistrationDate: date, ZipCode: int64, Consts: float64}'
    )
    sql = resource(url + '::' + tbl, dshape=csv.dshape)
    into(sql, csv, if_exists="replace")
    assert into(list, sql) == into(list, csv)
def test_csv_postgres_load():
    """Bulk-load a CSV into a database table via a raw cursor.

    NOTE(review): despite the name, ``LOAD DATA INFILE`` is MySQL syntax,
    not PostgreSQL — confirm which backend ``url`` points at.
    """
    tbl = 'testtable'

    # Drop any leftover table from a previous run.
    engine = sqlalchemy.create_engine(url)
    if engine.has_table(tbl):
        metadata = sqlalchemy.MetaData()
        metadata.reflect(engine)
        metadata.tables[tbl].drop(engine)

    csv = CSV(file_name)
    sql = resource(url + '::' + tbl, dshape=csv.dshape)

    # Issue the bulk load through the raw DB-API connection.
    conn = sql.bind.raw_connection()
    cursor = conn.cursor()
    full_path = os.path.abspath(file_name)
    load = '''LOAD DATA INFILE '{0}' INTO TABLE {1} FIELDS TERMINATED BY ',' lines terminated by '\n' '''.format(full_path, tbl)
    cursor.execute(load)
    conn.commit()
def test_Data_attribute_repr():
    """Datetime attribute access (.day) should repr like the pandas result."""
    table = Data(CSV(example('accounts-datetimes.csv')))
    result = table.when.day
    expected = pd.DataFrame({'when_day': [1, 2, 3, 4, 5]})
    assert repr(result) == repr(expected)
def test_unicode_column_names():
    """Non-ASCII header names should not break DataFrame conversion."""
    with filetext('foo\xc4\x87,a\n1,2\n3,4', extension='csv') as fn:
        csv = CSV(fn, has_header=True)
        # Success criterion is simply that conversion does not raise.
        df = into(pd.DataFrame, csv)