def test_pd_to_sql(self):
    dc = models.DataCenter(station_query_url='awergedfbvdbfnhfsnsbstndggf ',
                           dataselect_query_url='edf')
    self.session.add(dc)
    self.session.commit()

    id = 'abcdefghilmnopq'
    utcnow = datetime.datetime.utcnow()
    e = models.Station(id="a.b", network='a', datacenter_id=dc.id, station='b',
                       latitude=56, longitude=78)
    self.session.add(e)
    self.session.commit()

    stacolnames = list(colnames(models.Station))
    df = pd.DataFrame(columns=stacolnames, data=[[None for _ in stacolnames]])
    df.loc[0, 'id'] = id + '.j'
    df.loc[0, 'network'] = id
    df.loc[0, 'datacenter_id'] = dc.id
    df.loc[0, 'station'] = 'j'
    df.loc[0, 'latitude'] = 43
    df.loc[0, 'longitude'] = 56.7
    # df.loc[0, 'datacenter_id'] = dc.id
    df.to_sql(e.__table__.name, self.engine, if_exists='append', index=False)

    # same id as above, but check that data exist (i.e., error)
    df.loc[0, 'id'] = id
    with pytest.raises(IntegrityError):
        df.to_sql(e.__table__.name, self.engine, if_exists='append', index=False)
def tst_get_cols(self, seg):
    clen = len(seg.__class__.__table__.columns)
    cols = seg.__table__.columns
    c = list(colnames(seg.__class__))  # or models.Segment
    assert len(c) == clen
    c = list(colnames(seg.__class__, pkey=False))
    assert len(c) == clen - 1
    c = list(colnames(seg.__class__, pkey=True))
    assert len(c) == 1
    c = list(colnames(seg.__class__, fkey=False))
    assert len(c) == clen - 4
    c = list(colnames(seg.__class__, fkey=True))
    assert len(c) == 4
    c = list(colnames(seg.__class__, nullable=True))
    assert len(c) == 0
    c = list(colnames(seg.__class__, nullable=False))
    assert len(c) == clen
def rename_columns(query_df, query_type):
    """Renames the columns of `query_df` according to the "standard" expected column names
    given by `query_type`, so that IO operations with the database do not suffer from naming
    mismatches (e.g., non-matching cases). If the number of columns of `query_df` does not
    match the number of expected columns, a ValueError is raised.
    The assumption is that any datacenter returns the *same* columns in the *same* positions,
    as guessing columns by name might be tricky (there is not only a problem of case
    sensitivity, but also of e.g. "#Network" vs "network". <-Ref needed!)

    :param query_df: the DataFrame resulting from an fdsn query, either events, station
        (level=station) or station (level=channel)
    :param query_type: a string denoting the query type whereby `query_df` has been generated
        and determining the expected column names, so that `query_df` columns will be renamed
        accordingly. Possible values are "event", "station" (for a station query with
        parameter level=station) or "channel" (for a station query with parameter
        level=channel)
    :return: a new DataFrame with columns correctly renamed
    """
    if empty(query_df):
        return query_df

    Event, Station, Channel = models.Event, models.Station, models.Channel
    if query_type.lower() in ("event", "events"):
        columns = list(colnames(Event))
    elif query_type.lower() in ("station", "stations"):
        # these are the query_df columns for a station (level=station) query:
        # #Network|Station|Latitude|Longitude|Elevation|SiteName|StartTime|EndTime
        # set this table columns mapping (by name, so we can safely add any new column at any
        # index):
        columns = [Station.network.key, Station.station.key, Station.latitude.key,
                   Station.longitude.key, Station.elevation.key, Station.site_name.key,
                   Station.start_time.key, Station.end_time.key]
    elif query_type.lower() in ("channel", "channels"):
        # these are the query_df expected columns for a station (level=channel) query:
        # #Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|
        # SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
        # Some of them are for the Channel table, so select them:
        columns = [Station.network.key, Station.station.key, Channel.location.key,
                   Channel.channel.key, Station.latitude.key, Station.longitude.key,
                   Station.elevation.key, Channel.depth.key, Channel.azimuth.key,
                   Channel.dip.key, Channel.sensor_description.key, Channel.scale.key,
                   Channel.scale_freq.key, Channel.scale_units.key, Channel.sample_rate.key,
                   Station.start_time.key, Station.end_time.key]
    else:
        raise ValueError("Invalid fdsn_model: supply Events, Station or Channel class")

    oldcolumns = query_df.columns.tolist()
    if len(oldcolumns) != len(columns):
        raise ValueError("Mismatching number of columns in '%s' query.\nExpected:\n%s\nFound:\n%s"
                         % (query_type.lower(), str(columns), str(oldcolumns)))

    return query_df.rename(columns={cold: cnew for cold, cnew in zip(oldcolumns, columns)})
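# Illustrative sketch (not part of the test suite): how `rename_columns` above might be used on
# the text output of an fdsn station query. The payload below is made-up dummy data, and parsing
# it via pandas.read_csv is an assumption about the response format:
#
#     from io import StringIO
#     text = ("#Network|Station|Latitude|Longitude|Elevation|SiteName|StartTime|EndTime\n"
#             "XX|ABC|1.0|2.0|3.0|SomeSite|2006-01-01T00:00:00|\n")
#     sta_df = pd.read_csv(StringIO(text), sep='|')
#     sta_df = rename_columns(sta_df, "station")
#     # columns are now the models.Station keys (by position), e.g.
#     # 'network', 'station', 'latitude', 'longitude', 'elevation', 'site_name', ...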
def test_harmonize_columns(self):
    id = 'abcdefghilmnopq'
    utcnow = datetime.datetime.utcnow()
    eventcolnames = list(colnames(models.Event))
    df = pd.DataFrame(columns=eventcolnames, data=[[None for _ in eventcolnames]])
    # add a column which is NOT on the table:
    colx = 'iassdvgdhrnjynhnt_________'
    df.insert(0, colx, 1)
    cnames, df2 = _harmonize_columns(models.Event, df)
    # colx is not part of the Event model:
    assert colx not in cnames
    # df2 has actually been modified in place:
    assert (df.dtypes == df2.dtypes).all()
    df2types = df2.dtypes
    # checking if a class is datetime is cumbersome in numpy. See here:
    # http://stackoverflow.com/questions/23063362/consistent-way-to-check-if-an-np-array-is-datetime-like
    # so we do:
    assert 'datetime64' in str(df2types[models.Event.time.key])
    # other stuff works fine with normal check:
    assert df2types[models.Event.latitude.key] == np.float64
    assert df2types[models.Event.longitude.key] == np.float64
    assert df2types[models.Event.depth_km.key] == np.float64
    assert df2types[models.Event.magnitude.key] == np.float64
    # assert also other fields are objects (not all of them, just two):
    assert df2types[models.Event.event_location_name.key] == object
    assert df2types[models.Event.author.key] == object

    df3 = harmonize_columns(models.Event, df2)[cnames]  # this calls _harmonize_columns above
    assert colx not in df3.columns

    # now try to see with invalid values for floats
    evcolnames = list(colnames(models.Event))
    dfx = pd.DataFrame(columns=evcolnames, data=[["a" for _ in evcolnames]])
    _harmonize_columns(models.Event, dfx)
    # df2 and dfx should have the same dtypes:
    assert (dfx.dtypes == df2[cnames].dtypes).all()
    # fast check: datetimes and a float field
    assert pd.isnull(dfx.loc[0, models.Event.time.key])
    assert pd.isnull(dfx.loc[0, models.Event.longitude.key])

    # check harmonize rows: invalid rows should be removed (we have 1 invalid row)
    oldlen = len(dfx)
    dfrows = harmonize_rows(models.Event, dfx, inplace=False)
    assert len(dfrows) == 0 and len(dfx) == oldlen
    # check inplace=True
    dfrows = harmonize_rows(models.Event, dfx, inplace=True)
    assert len(dfrows) == len(dfx) == 0

    # go on by checking harmonize_columns. FIXME: what are we doing here below?
    dfx = pd.DataFrame(columns=evcolnames, data=[["a" for _ in evcolnames]])
    dfx.loc[0, models.Event.time.key] = utcnow
    dfx.loc[0, models.Event.latitude.key] = 6.5
    _harmonize_columns(models.Event, dfx)
    # fast check: datetimes and a float field
    assert pd.notnull(dfx.loc[0, models.Event.time.key])
    assert pd.isnull(dfx.loc[0, models.Event.longitude.key])
    assert pd.notnull(dfx.loc[0, models.Event.latitude.key])
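# Putting the pieces together: a minimal sketch, assuming the functions tested above behave as
# exercised here, of how a parsed fdsn events DataFrame could be cleaned and written to the
# database. `events_df` and `engine` are hypothetical placeholders:
#
#     events_df = rename_columns(events_df, "event")          # map response columns to model keys
#     events_df = harmonize_columns(models.Event, events_df)  # coerce values to the model dtypes
#     events_df = harmonize_rows(models.Event, events_df)     # drop rows with invalid/missing values
#     events_df.to_sql(models.Event.__table__.name, engine, if_exists='append', index=False)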
def testSqlAlchemy(self):
    # run_cols = models.Run.__table__.columns.keys()
    # run_cols.remove('id')  # remove id (auto set in the model)
    # d = pd.DataFrame(columns=run_cols, data=[[None for _ in run_cols]])
    # records = d.to_dict('records')  # records(index=False)
    # record = records[0]

    # pass a run_id without id and see if it's updated as utcnow:
    run_row = models.Run()
    assert run_row.id is None

    run_row = models.Run(id=None)
    assert run_row.id is None

    # test that methods of the base class work:
    cnames = list(colnames(run_row.__class__))
    assert len(cnames) > 0

    # test id is auto added:
    self.session.add_all([run_row])
    # self.session.flush()
    self.session.commit()
    assert run_row.id is not None

    # now pass a utcdatetime and see if we keep that value:
    utcnow = datetime.datetime.utcnow()
    run_row = models.Run(run_time=utcnow)
    assert run_row.run_time == utcnow
    self.session.add_all([run_row])
    # self.session.flush()
    self.session.commit()
    assert run_row.run_time == utcnow

    # test column names:
    # colz = run_row.get_col_names()
    # colz.remove('id')  # the primary key
    # d = pd.DataFrame(columns=colz, data=[[None for _ in colz]])
    # run_row._check_columns(d)

    # test types. string ints are parsed automatically? YES
    val = '6'
    e = models.Class(id=val)
    assert e.id != int(val)
    self.session.add(e)
    self.session.commit()
    assert e.id == int(val)

    # test types. string floats are parsed automatically as int? YES if INT
    # so this is NO:
    val = '5.2'
    e = models.Class(id=val)
    assert e.id != float(val)
    self.session.add(e)
    with pytest.raises(IntegrityError):
        self.session.commit()
    # necessary after failure? FIXME: check!
    self.session.rollback()
    # this is YES:
    val = '5.0'
    e = models.Class(id=val)
    assert e.id != int(float(val))
    self.session.add(e)
    self.session.commit()
    assert e.id == int(float(val))

    # test types. String floats are parsed automatically? YES
    val = '6.7'
    e = models.Event(id='abc', time=datetime.datetime.utcnow(), latitude=val, longitude=78,
                     magnitude=56, depth_km=45)
    assert e.latitude != float(val)
    self.session.add(e)
    self.session.commit()
    assert e.latitude == float(val)

    # create a datacenter WITHOUT the two fields stations and dataselect
    dc = models.DataCenter(station_query_url='abc')
    with pytest.raises(IntegrityError):
        self.session.add(dc)
        self.session.flush()
    self.session.rollback()
    # now add it properly:
    dc = models.DataCenter(station_query_url='abc', dataselect_query_url='edf')
    self.session.add(dc)
    self.session.flush()

    # test stations auto id (concat):
    # first test non-specified non-null field datacenter_id (should raise an IntegrityError)
    e = models.Station(network='abc', station='f')
    assert e.id == "abc.f"
    self.session.add(e)
    # we have not specified all non-null fields:
    with pytest.raises(IntegrityError):
        self.session.commit()
    self.session.rollback()

    # now test auto id
    e = models.Station(network='abc', datacenter_id=dc.id, station='f', latitude='89.5',
                       longitude='56')
    assert e.id == "abc.f"
    self.session.add(e)
    # now all non-null fields are specified:
    self.session.commit()
    assert e.id == "abc.f"

    # test unique constraints by changing only network
    sta = models.Station(network='a', datacenter_id=dc.id, station='f', latitude='89.5',
                         longitude='56')
    self.session.add(sta)
    self.session.commit()
    assert sta.id == "a.f"
    # now re-add it. Unique constraint failed
    sta = models.Station(network='a', datacenter_id=dc.id, station='f', latitude='189.5',
                         longitude='156')
    self.session.add(sta)
    with pytest.raises(IntegrityError):
        self.session.commit()
    self.session.rollback()

    # test stations channels relationship:
    sta = models.Station(network='ax', datacenter_id=dc.id, station='f', latitude='89.5',
                         longitude='56')
    # write channels WITHOUT foreign key
    cha1 = models.Channel(location='l', channel='HHZ', sample_rate=56)
    cha2 = models.Channel(location='l', channel='HHN', sample_rate=12)
    # add channels to the station.channels relationship
    sta.channels.append(cha1)
    sta.channels.append(cha2)
    # Now when adding and committing we should see the channels' foreign keys updated according
    # to sta id:
    self.session.add(sta)
    self.session.add(cha1)
    self.session.add(cha2)
    self.session.commit()
    # foreign keys are auto updated!!! TEST IT:
    assert cha1.station_id == sta.id
    assert cha2.station_id == sta.id

    # now test the same with a station read from the database. We don't actually need
    # a commit, flush is sufficient
    sta = self.session.query(models.Station).filter(models.Station.id == 'a.f').first()
    cha1 = models.Channel(location='l2', channel='HHZ', sample_rate=56)
    cha2 = models.Channel(location='l', channel='HHW', sample_rate=56)
    sta.channels.append(cha1)
    sta.channels.append(cha2)
    assert cha1.station_id != sta.id
    assert cha2.station_id != sta.id
    self.session.flush()
    # foreign keys are auto updated!!! TEST IT:
    assert cha1.station_id == sta.id
    assert cha2.station_id == sta.id

    k = self.session.query(models.Event).all()
    assert len(k) == 1