def harmonize_fdsn_dframe(query_df, query_type):
    """Harmonize the query dataframe (convert to the correct dtypes, remove
    rows with NaNs etcetera) according to `query_type`.

    For each model class associated with `query_type`, the dataframe is first
    passed to `harmonize_columns` and then to `harmonize_rows`.

    :param query_df: a query dataframe *on which `rename_columns` has already
        been called*
    :param query_type: either 'event', 'channel' or 'station' (case
        insensitive; the plural forms 'events', 'channels', 'stations' are
        accepted as well)
    :return: a new dataframe with only the good values. If `query_df` is
        empty, the result of `empty()` is returned and `query_type` is not
        inspected
    :raise ValueError: if `query_df` is not empty and `query_type` is not one
        of the recognized values
    """
    if empty(query_df):
        return empty()

    # normalize once instead of lower-casing in every branch:
    normalized_type = query_type.lower()
    if normalized_type in ("event", "events"):
        fdsn_model_classes = [models.Event]
    elif normalized_type in ("station", "stations"):
        fdsn_model_classes = [models.Station]
    elif normalized_type in ("channel", "channels"):
        fdsn_model_classes = [models.Station, models.Channel]
    else:
        # fail explicitly: without this branch `fdsn_model_classes` would be
        # unbound and the loop below would raise an obscure UnboundLocalError
        raise ValueError("Invalid query_type: %r (expected 'event', 'station' "
                         "or 'channel')" % (query_type,))

    for fdsn_model_class in fdsn_model_classes:
        # convert columns to correct dtypes (datetime, numeric etcetera).
        # Values not conforming will be set to NaN or NaT or None, thus
        # detectable via pandas.dropna or pandas.isnull
        query_df = harmonize_columns(fdsn_model_class, query_df)
        # we might have NA values (NaNs) after harmonize_columns, now
        # drop the rows with NA values (NA for columns which are non-nullable):
        query_df = harmonize_rows(fdsn_model_class, query_df)

    return query_df
def test_harmonize_columns(self):
    """Test `_harmonize_columns`, `harmonize_columns` and `harmonize_rows`
    on dataframes built from the `models.Event` columns.

    Checks that columns are cast to the expected dtypes, that columns not
    belonging to the model are not returned, that invalid values become NA,
    and that rows with invalid values are dropped (in place or not).
    """
    utcnow = datetime.datetime.utcnow()
    event_colnames = list(colnames(models.Event))

    df = pd.DataFrame(columns=event_colnames,
                      data=[[None for _ in event_colnames]])

    # add a column which is NOT on the table:
    colx = 'iassdvgdhrnjynhnt_________'
    df.insert(0, colx, 1)

    cnames, df2 = _harmonize_columns(models.Event, df)

    # colx is not part of the Event model:
    assert colx not in cnames

    # df2 has been modified in place actually:
    assert (df.dtypes == df2.dtypes).all()

    df2types = df2.dtypes
    # checking if a class is datetime is cumbersome in numpy. See here:
    # http://stackoverflow.com/questions/23063362/consistent-way-to-check-if-an-np-array-is-datetime-like
    # so we do:
    assert 'datetime64' in str(df2types[models.Event.time.key])
    # other stuff works fine with normal check:
    assert df2types[models.Event.latitude.key] == np.float64
    assert df2types[models.Event.longitude.key] == np.float64
    assert df2types[models.Event.depth_km.key] == np.float64
    assert df2types[models.Event.magnitude.key] == np.float64
    # assert also other fields are objects (not all of them, just two):
    assert df2types[models.Event.event_location_name.key] == object
    assert df2types[models.Event.author.key] == object

    # this calls _harmonize_columns above:
    df3 = harmonize_columns(models.Event, df2)[cnames]
    assert colx not in df3.columns

    # now try to see with invalid values for floats
    dfx = pd.DataFrame(columns=event_colnames,
                       data=[["a" for _ in event_colnames]])
    _harmonize_columns(models.Event, dfx)

    # df2 and dfx should have the same dtypes:
    assert (dfx.dtypes == df2[cnames].dtypes).all()

    # fast check: datetimes and a float field
    assert pd.isnull(dfx.loc[0, models.Event.time.key])
    assert pd.isnull(dfx.loc[0, models.Event.longitude.key])

    # check harmonize rows: invalid rows should be removed
    # (we have 1 invalid row)
    oldlen = len(dfx)
    dfrows = harmonize_rows(models.Event, dfx, inplace=False)
    # inplace=False: the returned dataframe is empty, the input is untouched
    assert len(dfrows) == 0 and len(dfx) == oldlen

    # check inplace = True: the input dataframe is emptied as well
    dfrows = harmonize_rows(models.Event, dfx, inplace=True)
    assert len(dfrows) == len(dfx) == 0

    # go on by checking harmonize_columns with a row mixing valid and
    # invalid values: the valid ones (time, latitude) must be preserved,
    # whereas the invalid ones (e.g. longitude = "a") must become NA
    dfx = pd.DataFrame(columns=event_colnames,
                       data=[["a" for _ in event_colnames]])
    dfx.loc[0, models.Event.time.key] = utcnow
    dfx.loc[0, models.Event.latitude.key] = 6.5

    _harmonize_columns(models.Event, dfx)

    # fast check: datetimes and a float field
    assert pd.notnull(dfx.loc[0, models.Event.time.key])
    assert pd.isnull(dfx.loc[0, models.Event.longitude.key])
    assert pd.notnull(dfx.loc[0, models.Event.latitude.key])