예제 #1
0
def harmonize_fdsn_dframe(query_df, query_type):
    """Harmonize the query dataframe (dtype conversion, removal of invalid rows) according
    to `query_type`.

    For each model class associated with `query_type`, `harmonize_columns` is called first
    and `harmonize_rows` is called on its result.

    :param query_df: a query dataframe *on which `rename_columns` has already been called*
    :param query_type: string (case insensitive) denoting the query type: 'event', 'station'
        or 'channel'. The plural forms ('events', 'stations', 'channels') are accepted too
    :return: `empty()` if `empty(query_df)` is true, otherwise the dataframe resulting from
        the harmonization (only the good values are kept)
    :raise ValueError: if `query_df` is not empty and `query_type` is not one of the
        supported values
    """
    if empty(query_df):
        return empty()

    # normalize once instead of lower-casing the string in every branch:
    qtype = query_type.lower()
    if qtype in ("event", "events"):
        fdsn_model_classes = [models.Event]
    elif qtype in ("station", "stations"):
        fdsn_model_classes = [models.Station]
    elif qtype in ("channel", "channels"):
        # channel queries are harmonized against both models, Station first
        fdsn_model_classes = [models.Station, models.Channel]
    else:
        # fail with a meaningful message: without this branch `fdsn_model_classes` would be
        # unbound and the loop below would raise an obscure UnboundLocalError
        raise ValueError("Invalid query_type {!r}: expected 'event', 'station' or "
                         "'channel'".format(query_type))

    # convert columns to correct dtypes (datetime, numeric etcetera). Values not conforming
    # will be set to NaN or NaT or None, thus detectable via pandas.dropna or pandas.isnull
    for fdsn_model_class in fdsn_model_classes:
        query_df = harmonize_columns(fdsn_model_class, query_df)
        # we might have NA values (NaNs) after harmonize_columns, now
        # drop the rows with NA values (NA for columns which are non-nullable):
        query_df = harmonize_rows(fdsn_model_class, query_df)

    return query_df
예제 #2
0
    def test_harmonize_columns(self):
        """Test `_harmonize_columns`, `harmonize_columns` and `harmonize_rows` on dataframes
        built from the Event model columns.

        Checked here:
        - a column which is not part of the Event model is not among the returned column names
        - the Event columns get the expected dtypes (datetime64, float64, object)
        - values which cannot be converted become null
        - `harmonize_rows` removes the invalid row, modifying the input only if `inplace=True`
        """
        # NOTE: naive UTC datetime, kept as in the original test (an aware datetime might
        # change what is being tested)
        utcnow = datetime.datetime.utcnow()

        event_cols = list(colnames(models.Event))
        df = pd.DataFrame(columns=event_cols,
                          data=[[None] * len(event_cols)])

        # add a column which is NOT on the table:
        colx = 'iassdvgdhrnjynhnt_________'
        df.insert(0, colx, 1)

        cnames, df2 = _harmonize_columns(models.Event, df)

        # colx is not part of the Event model:
        assert colx not in cnames

        # the input dataframe has the same dtypes as the returned one
        # (i.e., df has been modified in place actually):
        assert (df.dtypes == df2.dtypes).all()

        df2types = df2.dtypes
        # checking if a class is datetime is cumbersome in numpy. See here:
        # http://stackoverflow.com/questions/23063362/consistent-way-to-check-if-an-np-array-is-datetime-like
        # so we do:
        assert 'datetime64' in str(df2types[models.Event.time.key])
        # other stuff works fine with normal check:
        assert df2types[models.Event.latitude.key] == np.float64
        assert df2types[models.Event.longitude.key] == np.float64
        assert df2types[models.Event.depth_km.key] == np.float64
        assert df2types[models.Event.magnitude.key] == np.float64
        # assert also other fields are objects (not all of them, just two):
        assert df2types[models.Event.event_location_name.key] == object
        assert df2types[models.Event.author.key] == object

        # this calls _harmonize_columns above:
        df3 = harmonize_columns(models.Event, df2)[cnames]

        assert colx not in df3.columns

        # now try to see with invalid values for floats (and datetimes): every cell is "a"
        dfx = pd.DataFrame(columns=event_cols,
                           data=[["a"] * len(event_cols)])

        _harmonize_columns(models.Event, dfx)

        # df2 and dfx should have the same dtypes:
        assert (dfx.dtypes == df2[cnames].dtypes).all()

        # fast check: datetimes and a float field
        assert pd.isnull(dfx.loc[0, models.Event.time.key])
        assert pd.isnull(dfx.loc[0, models.Event.longitude.key])

        # check harmonize rows: invalid rows should be removed (we have 1 invalid row)
        oldlen = len(dfx)
        dfrows = harmonize_rows(models.Event, dfx, inplace=False)
        # inplace=False: the returned dataframe is empty, the input one is untouched
        assert len(dfrows) == 0 and len(dfx) == oldlen
        # check inplace = True
        dfrows = harmonize_rows(models.Event, dfx, inplace=True)
        assert len(dfrows) == len(dfx) == 0

        # go on by checking harmonize_columns on a row mixing valid and invalid values:
        # time and latitude are valid and must be preserved (not null), whereas longitude
        # is still "a" and must be converted to null
        dfx = pd.DataFrame(columns=event_cols,
                           data=[["a"] * len(event_cols)])

        dfx.loc[0, models.Event.time.key] = utcnow
        dfx.loc[0, models.Event.latitude.key] = 6.5

        _harmonize_columns(models.Event, dfx)
        # fast check: datetimes and a float field
        assert pd.notnull(dfx.loc[0, models.Event.time.key])
        assert pd.isnull(dfx.loc[0, models.Event.longitude.key])
        assert pd.notnull(dfx.loc[0, models.Event.latitude.key])