def test_csv_to_compressed_csv():
    with tmpfile('.csv') as fn:
        with open(fn, 'w') as f:
            f.write('a,1\nb,2\nc,3')
        with tmpfile('.csv.gz') as gfn:
            result = odo(fn, gfn)
            assert odo(result, list) == odo(fn, list)
def into(input, output, name, identifier):
    """Prepend a TableNumber column to a CSV file and load it into a database.

    :param input: path to the source CSV file
    :param output: database URI to load into
    :param name: name of the target table
    :param identifier: value written to the prepended ``TableNumber`` column
    :return: None
    """
    with tempfile.TemporaryDirectory() as directory:
        source = os.path.join(directory, os.path.basename(input))
        with open(input, "r") as fin, open(source, "w") as fout:
            reader = csv.reader(fin)
            writer = csv.writer(fout)
            headers = next(reader)
            headers = [__format__(name, header) for header in headers]
            headers = ["TableNumber"] + headers
            writer.writerow(headers)
            for row in reader:
                writer.writerow([identifier] + row)
        odo.odo(source, "{}::{}".format(output, name), has_header=True, delimiter=",")
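# A minimal usage sketch for the helper above; the file name, database URI,
# table name, and identifier are all hypothetical.
into("Cells.csv", "sqlite:///test.db", "cells", "plate-01")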
def test_nan_to_nat():
    assert odo(float('nan'), pd.Timestamp) is pd.NaT
    assert odo(np.nan, pd.Timestamp) is pd.NaT

    with pytest.raises(NetworkXNoPath):
        # Check that only nan can be converted.
        odo(0.5, pd.Timestamp)
def test_dist(nyc):
    def distance(lat1, lon1, lat2, lon2, R=3959):
        # http://andrew.hedges.name/experiments/haversine/
        dlon = radians(lon2 - lon1)
        dlat = radians(lat2 - lat1)
        a = sin(dlat / 2.0) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2.0) ** 2
        return R * 2 * atan2(sqrt(a), sqrt(1 - a))

    t = symbol('t', discover(nyc))

    filtered = t[
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    ]
    dist = distance(filtered.pickup_latitude, filtered.pickup_longitude,
                    filtered.dropoff_latitude, filtered.dropoff_longitude)
    transformed = transform(filtered, dist=dist)
    assert (
        odo(compute(transformed.dist.max(), nyc), float) ==
        odo(compute(transformed.dist, nyc), pd.Series).max().item()
    )
def test_shift_arithmetic(sql, n):
    t = symbol('t', discover(sql))
    expr = t.B - t.B.shift(n)
    result = odo(compute(expr, sql), pd.Series)
    df = odo(sql, pd.DataFrame)
    expected = df.B - df.B.shift(n)
    tm.assert_series_equal(result, expected)
def test_strlen(ctx, db):
    expr = db.t.name.strlen()
    result = odo(compute(expr, ctx), pd.Series)
    expected = compute(expr, {db: {'t': df}})
    assert result.name == 'name'
    assert expected.name == 'name'
    assert odo(result, set) == odo(expected, set)
def test_convert_logfiles_to_bag():
    with filetexts({'a1.log': 'Hello\nWorld', 'a2.log': 'Hola\nMundo'}) as fns:
        logs = chunks(TextFile)(list(map(TextFile, fns)))
        b = odo(logs, Bag)
        assert isinstance(b, Bag)
        assert 'a1.log' in str(b.dask.values())
        assert odo(b, list) == odo(logs, list)
def concrete_head(expr, n=10):
    """ Return head of computed expression """
    if not expr._resources():
        raise ValueError("Expression does not contain data resources")
    if not iscollection(expr.dshape):
        return compute(expr)

    head = expr.head(n + 1)

    if not iscollection(expr.dshape):
        return odo(head, object)
    elif isrecord(expr.dshape.measure):
        return odo(head, DataFrame)
    else:
        df = odo(head, DataFrame)
        df.columns = [expr._name]
        return df

    # NOTE: everything below is unreachable -- every branch above returns.
    result = compute(head)

    if len(result) == 0:
        return DataFrame(columns=expr.fields)
    if isrecord(expr.dshape.measure):
        return odo(result, DataFrame, dshape=expr.dshape)
    else:
        df = odo(result, DataFrame, dshape=expr.dshape)
        df.columns = [expr._name]
        return df
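# Hedged usage sketch for concrete_head, assuming a blaze interactive data
# source; the inline data and names are made up for illustration.
import blaze as bz
t = bz.data([(1, 'a'), (2, 'b')], dshape='var * {x: int32, y: string}')
concrete_head(t, n=5)    # -> DataFrame with columns ['x', 'y']
concrete_head(t.x, n=5)  # -> DataFrame with the single column ['x']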
def get_materialized_checkpoints(checkpoints, colnames, lower_dt, odo_kwargs):
    """Computes a lower bound and a DataFrame of checkpoints.

    Parameters
    ----------
    checkpoints : Expr
        Bound blaze expression for a checkpoints table from which to get a
        computed lower bound.
    colnames : iterable of str
        The names of the columns for which checkpoints should be computed.
    lower_dt : pd.Timestamp
        The lower date being queried for that serves as an upper bound for
        checkpoints.
    odo_kwargs : dict, optional
        The extra keyword arguments to pass to ``odo``.
    """
    if checkpoints is not None:
        ts = checkpoints[TS_FIELD_NAME]
        checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp)
        if pd.isnull(checkpoints_ts):
            # We don't have a checkpoint for this or any earlier date.
            materialized_checkpoints = pd.DataFrame(columns=colnames)
            lower = None
        else:
            materialized_checkpoints = odo(
                checkpoints[ts == checkpoints_ts][colnames],
                pd.DataFrame,
                **odo_kwargs
            )
            lower = checkpoints_ts
    else:
        materialized_checkpoints = pd.DataFrame(columns=colnames)
        lower = None

    return lower, materialized_checkpoints
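# Hedged usage sketch; assumes TS_FIELD_NAME names the checkpoint table's
# timestamp column ('timestamp' here) and that blaze is importable.
import blaze as bz
import pandas as pd

checkpoints = bz.data(pd.DataFrame({
    'timestamp': pd.to_datetime(['2014-01-01', '2014-02-01']),
    'value': [1.0, 2.0],
}))
lower, materialized = get_materialized_checkpoints(
    checkpoints, ['value'], pd.Timestamp('2014-01-15'), odo_kwargs={},
)
# lower is the newest checkpoint timestamp at or before lower_dt;
# materialized holds the 'value' column for that checkpoint.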
def test_seed(dataset):
    munge_file = pkg_resources.resource_filename(__name__, "../perturbation/scripts/munge.sh")

    if dataset["munge"]:
        subprocess.call([munge_file, dataset["data_dir"]])

    config_file = os.path.join(dataset["data_dir"], "config.ini")
    config = configparser.ConfigParser()
    config.read(config_file)

    with tempfile.TemporaryDirectory() as temp_dir:
        sqlite_file = os.path.join(temp_dir, "test.db")

        perturbation.ingest.seed(config=config, source=dataset["data_dir"],
                                 target="sqlite:///{}".format(str(sqlite_file)))

        for (k, v) in {"cells": "Cells.csv", "cytoplasm": "Cytoplasm.csv", "nuclei": "Nuclei.csv"}.items():
            config["filenames"][k] = v

        for table_key in ["image", "cells", "cytoplasm", "nuclei"]:
            csv_filename = os.path.join(temp_dir, config["filenames"][table_key])
            table_name = config["filenames"][table_key].split(".")[0]

            odo.odo("sqlite:///{}::{}".format(str(sqlite_file), table_name), csv_filename)

            df = pd.read_csv(csv_filename)

            assert df.shape[0] == dataset["ingest"]["{}_nrows".format(table_name)]
            assert df.shape[1] == dataset["ingest"]["{}_ncols".format(table_name)] + 1

            if table_key != "image":
                group_sizes = df.groupby(["TableNumber", "ImageNumber"]).size().sum()
                assert group_sizes == dataset["ingest"]["{}_nrows".format(table_name)]
def create_plot(team="LAA", year=2012):
    expr = bz.by(db.Salaries.teamID,
                 avg=db.Salaries.salary.mean(),
                 max=db.Salaries.salary.max(),
                 ratio=db.Salaries.salary.max() / db.Salaries.salary.min())
    expr = expr.sort('ratio', ascending=False)

    df_salary_gb = odo(expr, pd.DataFrame)
    source1 = odo(df_salary_gb[["teamID", "avg"]], ColumnDataSource)

    plot1 = plt.figure(title="Salary ratio by team",
                       x_range=list(df_salary_gb["teamID"]))
    plot1.scatter(x="teamID", y="avg", source=source1, size=20)
    plot1.xaxis.major_label_orientation = np.pi / 3

    df = odo(db.Salaries, pd.DataFrame)
    df = df[df["teamID"] == team]
    df = df[df["yearID"] == year]
    df = df[["playerID", "salary"]].sort('salary')
    source_team = odo(df, ColumnDataSource)

    p_team = plt.figure(title="Salary of players for %s during %s" % (team, year),
                        x_range=list(df["playerID"]))  # , tools=TOOLS)
    p_team.scatter(x="playerID", y="salary", source=source_team, size=20)
    p_team.xaxis.major_label_orientation = np.pi / 3

    p = plt.gridplot([[plot1, p_team]])

    return p
def test_convert_logfiles_to_bag():
    with filetexts({'a1.log': 'Hello\nWorld', 'a2.log': 'Hola\nMundo'}) as fns:
        logs = chunks(TextFile)(list(map(TextFile, fns)))
        b = odo(logs, Bag)
        assert isinstance(b, Bag)
        assert (list(map(methodcaller('strip'), odo(b, list))) ==
                list(map(methodcaller('strip'), odo(logs, list))))
def test_append_chunks():
    tbl = resource("sqlite:///:memory:::test", dshape="var * {a: int, b: int}")

    res = odo(
        chunks(np.ndarray)((
            np.array([[0, 1], [2, 3]]),
            np.array([[4, 5], [6, 7]]),
        )),
        tbl,
    )
    assert res is tbl
    assert (
        odo(tbl, np.ndarray) ==
        np.array([(0, 1), (2, 3), (4, 5), (6, 7)],
                 dtype=[("a", "<i4"), ("b", "<i4")])
    ).all()
def test_df_to_in_memory_db():
    df = pd.DataFrame([[1, 2], [3, 4]], columns=list('ab'))
    tbl = odo(df, 'sqlite:///:memory:::tbl')
    pd.util.testing.assert_frame_equal(
        odo(tbl, pd.DataFrame),
        df,
    )
def handle(self, *args, **options):
    # set up
    config = get_config()
    if config is None:
        raise CommandError('Unable to process configuration file p_to_p.yml')

    connection = get_connection(config)
    pedsnet_session = init_pedsnet(connection)
    init_pcornet(connection)

    observation_period = pedsnet_session.query(
        ObservationPeriod.person_id,
        ObservationPeriod.observation_period_start_date,
        ObservationPeriod.observation_period_end_date,
        ObservationPeriod.site,
        bindparam("chart", 'Y'),
        bindparam("enr_basis", 'E')
    ).filter(
        exists().where(ObservationPeriod.person_id == PersonVisit.person_id)
    ).all()

    odo(observation_period, Enrollment.__table__,
        dshape='var * {patid: string, enr_start_date: date, enr_end_date: date, '
               'site: string, chart: string, enr_basis: string}')

    # close session
    pedsnet_session.close()

    # output result
    self.stdout.ending = ''
    print('Enrollment ETL completed successfully', end='', file=self.stdout)
def test_str_len(ctx, db):
    expr = db.t.name.str.len()
    result = odo(compute(expr, ctx, return_type='native'), pd.Series)
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert result.name == 'name'
    assert expected.name == 'name'
    assert odo(result, set) == odo(expected, set)
def test_na_value(sql, csv):
    sql = odo(null_data, sql)
    with tmpfile('.csv') as fn:
        csv = odo(sql, fn, na_value='NA')
        with open(csv.path, 'rt') as f:
            raw = f.read()
        assert raw == 'a,b\n1,NA\n10,20\n100,200\n'
def test_compute_kwargs(test, serial):
    expr = t.dumb.sort()
    bad_query = {'expr': to_tree(expr)}
    result = test.post(
        '/compute',
        headers=mimetype(serial),
        data=serial.dumps(bad_query),
    )
    assert result.status_code == 500
    assert b'return_df must be passed' in result.data

    good_query = {
        'expr': to_tree(expr),
        'compute_kwargs': {
            'return_df': odo(DumbResource.df, list),
        },
    }
    result = test.post(
        '/compute',
        headers=mimetype(serial),
        data=serial.dumps(good_query),
    )
    assert result.status_code == 200

    data = serial.loads(result.data)
    dshape = discover(DumbResource.df)
    assert_dshape_equal(
        datashape.dshape(data['datashape']),
        dshape,
    )
    assert_frame_equal(
        odo(data['data'], DataFrame, dshape=dshape),
        DumbResource.df,
    )
def test_sql_to_csv(sql, csv, tmpdir):
    sql, bind = sql
    sql = odo(csv, sql, bind=bind)
    with tmpfile('.csv', dir=tmpdir) as fn:
        csv = odo(sql, fn, bind=bind)
        assert odo(csv, list) == data
        assert discover(csv).measure.names == discover(sql).measure.names
def merge_temp_files(dbname, image_names=None, do_odo=False):
    if do_odo:
        logging.info('Merging temp files with odo.')
    else:
        logging.info('Merging temp files manually.')
    if image_names is None:
        image_names = get_image_names(dbname)
    dbname_base, ext = os.path.splitext(dbname)
    dbnamenew = dbname_base + '_cleaned' + ext
    logging.info('Creating concatenated db file {}'.format(dbnamenew))
    if not do_odo:
        df = []
    for image_name in image_names:
        try:
            if do_odo:
                odo('hdfstore://{}::df'.format(get_temp_fname(image_name)),
                    'hdfstore://{}::df'.format(dbnamenew))
            else:
                df.append(pd.read_hdf(get_temp_fname(image_name), 'df'))
        except OSError:
            continue
        else:
            os.remove(get_temp_fname(image_name))
    if not do_odo:
        # Only the manual path accumulates frames that need concatenating.
        df = pd.concat(df, ignore_index=True)
        df.to_hdf(dbnamenew, 'df', format='table', data_columns=data_columns)
    logging.info('Duplicates removal complete.')
    return dbnamenew
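# Hedged usage sketch; the HDF5 store name is hypothetical. With do_odo=True
# each temp store is appended via odo instead of concatenated in memory.
merged = merge_temp_files('results.h5', do_odo=True)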
def each_partition(values):
    for fpath in values:
        print('Processing: ', fpath)
        reader = FACC1Reader(fpath)
        for fname, df in reader:
            output_path = get_output_path(fpath, fname)
            odo(df, output_path)
def test_math(ctx, db, func):
    expr = func(db.t.amount)
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    np.testing.assert_allclose(
        np.sort(odo(result, np.ndarray, dshape=expr.dshape)),
        np.sort(odo(expected, np.ndarray)),
    )
def test_sql_select_to_csv(sql, csv, tmpdir):
    sql, bind = sql
    sql = odo(csv, sql, bind=bind)
    query = sa.select([sql.c.a])
    with tmpfile('.csv', dir=tmpdir) as fn:
        csv = odo(query, fn, bind=bind)
        assert odo(csv, list) == [(x,) for x, _ in data]
def test_invalid_escapechar(sql, csv):
    sql, bind = sql
    with pytest.raises(ValueError):
        odo(csv, sql, escapechar='12', bind=bind)

    with pytest.raises(ValueError):
        odo(csv, sql, escapechar='', bind=bind)
def handle(self, *args, **options):
    # set up
    config = get_config()
    if config is None:
        raise CommandError('Unable to process configuration file p_to_p.yml')

    connection = get_connection(config)
    pedsnet_session = init_pedsnet(connection)
    init_pcornet(connection)

    query = pedsnet_session.query(
        VisitOccurrence.person_id,
        VisitOccurrence.visit_occurrence_id.label('visit_id')
    ).filter(extract('year', VisitOccurrence.visit_start_date) >= 2001).statement

    for df in pd.read_sql(query, pedsnet_session.bind, chunksize=50000):
        odo(df, PersonVisit.__table__,
            dshape='var * {person_id: int, visit_id: int}')

    # close session
    pedsnet_session.close()

    # output result
    self.stdout.ending = ''
    print('Person Visit ETL completed successfully', end='', file=self.stdout)
def join_dataframe_to_selectable(expr, lhs, rhs, scope=None, **kwargs):
    lexpr, rexpr = expr._leaves()
    return compute(
        expr,
        {lexpr: odo(lhs, pd.DataFrame, dshape=lexpr.dshape),
         rexpr: odo(rhs, pd.DataFrame, dshape=rexpr.dshape)},
        **kwargs
    )
def demographic_etl(config):
    # set up
    connection = get_connection(config)
    pedsnet_session = init_pedsnet(connection)
    init_pcornet(connection)

    # multiple aliases for pedsnet_pcornet_valueset_map
    # to allow the three named joins
    gender_value_map = aliased(ValueSetMap)
    ethnicity_value_map = aliased(ValueSetMap)
    race_value_map = aliased(ValueSetMap)

    # extract the data from the person table
    person = pedsnet_session.query(
        Person.person_id,
        Person.birth_date,
        Person.birth_time,
        coalesce(gender_value_map.target_concept, 'OT'),
        coalesce(ethnicity_value_map.target_concept, 'OT'),
        coalesce(race_value_map.target_concept, 'OT'),
        bindparam("biobank_flag", "N"),
        Person.gender_source_value,
        Person.ethnicity_source_value,
        Person.race_source_value,
        Person.site,
        bindparam("gender_identity", None),
        bindparam("raw_gender_identity", None),
        bindparam("sexual_orientation", None),
        bindparam("raw_sexual_orientation", None)
    ).outerjoin(
        gender_value_map,
        and_(gender_value_map.source_concept_class == 'Gender',
             case([(and_(Person.gender_concept_id == None,
                         gender_value_map.source_concept_id == None), True)],
                  else_=cast(Person.gender_concept_id, String(200)) ==
                        gender_value_map.source_concept_id))
    ).outerjoin(
        ethnicity_value_map,
        and_(ethnicity_value_map.source_concept_class == 'Hispanic',
             case([(and_(Person.ethnicity_concept_id == None,
                         ethnicity_value_map.source_concept_id == None), True)],
                  else_=cast(Person.ethnicity_concept_id, String(200)) ==
                        ethnicity_value_map.source_concept_id))
    ).outerjoin(
        race_value_map,
        and_(race_value_map.source_concept_class == 'Race',
             case([(and_(Person.race_concept_id == None,
                         race_value_map.source_concept_id == None), True)],
                  else_=cast(Person.race_concept_id, String(200)) ==
                        race_value_map.source_concept_id))
    ).all()

    # transform data to pcornet names and types
    # load to demographic table
    odo(person, Demographic.__table__,
        dshape='var * {patid: string, birth_date: date, birth_time: string, sex: string,'
               'hispanic: string, race: string, biobank_flag: string, raw_sex: string,'
               'raw_hispanic: string, raw_race: string, site: string, gender_identity: string,'
               'raw_gender_identity: string, sexual_orientation: string, raw_sexual_orientation: string}')

    # close session
    pedsnet_session.close()
def test_sample(sql):
    t = symbol('t', discover(sql))
    result = compute(t.sample(n=1), sql)
    s = odo(result, pd.DataFrame)
    assert len(s) == 1
    result2 = compute(t.sample(frac=0.5), sql)
    s2 = odo(result2, pd.DataFrame)
    assert len(s) == len(s2)
def test_from_dataframe_strings(sql_with_strings):
    sql_with_strings, bind = sql_with_strings
    input_ = pd.DataFrame([['ayy', 'hello "world"'],
                           ['lmao', None]],
                          columns=['non_optional', 'optional'])
    odo(input_, sql_with_strings, bind=bind)
    output = odo(sql_with_strings, pd.DataFrame, bind=bind)
    pd.util.testing.assert_frame_equal(output, input_)
def convert_base(typ, x):
    x = compute(x)
    try:
        return typ(x)
    except Exception:
        # Fall back to odo when the direct constructor call fails.
        return typ(odo(x, typ))
def tosqlite(self):
    odo(self.df, 'sqlite:///cme.db::ticktable')
# import a.csv
pd.read_sql_table('imported', sql_engine)

# import b.csv => fix with d6tstack

# pandas to sql
df.to_sql('a', sql_engine, if_exists='replace', index=False)
pd.read_sql_table('a', sql_engine)

df.to_sql('a', sql_engine, if_exists='append', index=False)
pd.read_sql_table('a', sql_engine)

# dask NO to sql

# odo import to db
import odo
odo.odo('data/s3-201806/a.csv',
        'mysql+mysqlconnector://augvest:augvest@localhost/augvest::imported')
pd.read_sql_table('imported', sql_engine)

# excel? convert to csv using d6stack

#****************************************
# regular updates
#****************************************

# csv, txt, excel
# option 1: rerun with more data
# option 2: incremental run

# database
dfts1 = pd.DataFrame({
    'date': pd.date_range('2018-01-01', periods=5),
def dumb_to_df(d, return_df=None, **kwargs):
    if return_df is None:
        raise DumbResource.NoResource('return_df must be passed')
    to_return = odo(return_df, DataFrame, dshape=discover(d))
    assert_frame_equal(to_return, DumbResource.df)
    return to_return
def test_convert_bag_to_list():
    assert odo(b, list) == L
from odo import odo
import pymongo
import pandas as pd

# This is a fix for the fields 'previouslySmoked' and 'currentlySmoking' being
# exclusive. In the UKBB, if you are currently smoking, it assumes that you were
# not previously smoking. This script populates 'previouslySmoked' with 1 if
# 'currentlySmoking' is 1, and adjusts the -perDay numbers as well.
client = pymongo.MongoClient('localhost', 27017)
db = client.ukbb
collection = db['ahriCleaner2']
cursor = collection.find()

df = pd.DataFrame(list(cursor))
df.drop(inplace=True, columns=["_id"])

df.loc[df['currentlySmoking'] == 1, ['previouslySmoked']] = 1
df['noOfCigarettesPreviouslyPerDay'] = df.apply(
    lambda row: row['noOfCigarettesPerDay']
    if row['currentlySmoking'] == 1
    else row['noOfCigarettesPreviouslyPerDay'],
    axis=1)

odo(df, db.ahriSmokingFix)
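# Toy sanity check of the rule above (hypothetical data, same column names):
toy = pd.DataFrame({
    'currentlySmoking': [1, 0],
    'previouslySmoked': [0, 1],
    'noOfCigarettesPerDay': [10, 0],
    'noOfCigarettesPreviouslyPerDay': [0, 5],
})
toy.loc[toy['currentlySmoking'] == 1, ['previouslySmoked']] = 1
# Row 0 now has previouslySmoked == 1; row 1 is unchanged.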
def gl_data(sql_two_tables):
    u_data, t_data = sql_two_tables
    # populate the tables with some data and return it
    return data(odo([(1, )], u_data)), data(odo([(2, )], t_data))
def test_sequence():
    b = odo([1, 2, 3], Bag)
    assert set(b.map(inc)) == set([2, 3, 4])
def test_least(gl_data):
    u, t = gl_data
    assert odo(least(u.a.max(), t.a.max()), int) == 1
def test_greatest(gl_data):
    u, t = gl_data
    assert odo(greatest(u.a.max(), t.a.max()), int) == 2
def test_postgres_create(sql):
    assert odo(sql, list) == [('a', 1), ('b', 2)]
def test_postgres_isnan(sql_with_float):
    data = (1.0, ), (float('nan'), )
    table = odo(data, sql_with_float)
    sym = symbol('s', discover(data))
    assert odo(compute(sym.isnan(), table), list) == [(False, ), (True, )]
def test_shift_on_column(n, column, sql):
    t = symbol('t', discover(sql))
    expr = t[column].shift(n)
    result = odo(compute(expr, sql), pd.Series)
    expected = odo(sql, pd.DataFrame)[column].shift(n)
    tm.assert_series_equal(result, expected)
                         names=[r[0] for r in village_schema])
village_df.to_csv('villagecd.csv', index=False)

# csv => postgresql
import odo
import sqlalchemy

engine = sqlalchemy.create_engine('postgresql://jimmy@localhost:5432/jimmy')
conn = engine.connect()

history_cols = [
    "{} character varying({})".format(r[0], r[2]) for r in history_schema
]
history_ct = "create table qvf_det_history ({})".format(
    ", ".join(history_cols))
conn.execute(history_ct)
odo.odo('./detroit_history.csv',
        'postgresql://jimmy@localhost:5432/jimmy::qvf_det_history')

voter_cols = [
    "{} character varying({})".format(r[0], r[2]) for r in voter_schema
]
voter_ct = "create table qvf_det_voters ({})".format(", ".join(voter_cols))
conn.execute(voter_ct)
odo.odo('./detroit_voters.csv',
        "postgresql://jimmy@localhost:5432/jimmy::qvf_det_voters")

county_cols = [
    "{} character varying({})".format(r[0], r[2]) for r in county_schema
]
county_ct = "create table qvf_county ({})".format(", ".join(county_cols))
conn.execute(county_ct)
odo.odo('./countycd.csv',
def test_csv_infer_header():
    with tmpfile('db') as dbfilename:
        with filetext('a,b\n1,2\n3,4', extension='csv') as csvfilename:
            t = odo(csvfilename, 'sqlite:///%s::mytable' % dbfilename)
            assert discover(t) == dshape('var * {a: int64, b: int64}')
            assert odo(t, set) == set([(1, 2), (3, 4)])
def test_quoted_name(csv, quoted_sql):
    with tmpfile('csv') as filename:
        csv = odo(data, filename, dshape=ds, has_header=True)
        s = odo(csv, quoted_sql)
        t = odo(csv, list)
        assert sorted(odo(s, list)) == sorted(t)
def test_nunique_spark_dataframe(ctx, db):
    assert (odo(compute(db.t.nunique(), ctx), int) ==
            ctx.table('t').distinct().count())
def sync_from_csv_to_sqlite(csv_path, sqlite_uri):
    dshape = discover(resource(csv_path))
    odo(csv_path, sqlite_uri, dshape=dshape)
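# Example call (hypothetical paths), using odo's database_uri::table convention:
sync_from_csv_to_sqlite('events.csv', 'sqlite:///events.db::events')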
def csv():
    with tmpfile('csv') as filename:
        yield odo(data, filename, dshape=ds, has_header=False)
def test_ugly_schema(csv, sql_with_ugly_schema):
    sql_with_ugly_schema, bind = sql_with_ugly_schema
    assert (odo(odo(csv, sql_with_ugly_schema, bind=bind), list, bind=bind) ==
            data)
def _store(self, df):
    # self.schema.conform_df(df, storage_target=self.storage_target_type, skip_sort=True)
    odo.odo(df, self.odo_target)  # , dshape=schema_to_dshape(self.schema))
def test_by_with_date(ctx, db, attr):
    expr = by(getattr(db.dates.ds, attr), mean=db.dates.amount.mean())
    result = odo(compute(expr, ctx), set)
    expected = odo(compute(expr, {db: {'dates': date_df}}), set)
    assert result == expected
def test_datetime_to_timestamp():
    dt = datetime(2014, 1, 1)
    ts = odo(dt, pd.Timestamp)
    assert isinstance(ts, pd.Timestamp)
    assert ts == pd.Timestamp('2014-01-01')
def test_quoted_name(quoted_sql, csv):
    s = odo(csv, quoted_sql)
    t = odo(csv, list)
    assert sorted(odo(s, list)) == sorted(t)
def test_isin(ctx, db, keys):
    expr = db.t[db.t.id.isin(keys)]
    result = odo(compute(expr, ctx), set)
    expected = odo(compute(expr, {db: {'t': df}}), set)
    assert (set(map(frozenset, odo(result, list))) ==
            set(map(frozenset, odo(expected, list))))