def test_es_if_exists_replace(self):
    """Exercise ``es_if_exists="replace"`` three times against "test-index".

    The module-level frames ``pd_df2`` and ``pd_df`` are written in turn and
    each write is read back and compared with what was sent.
    """
    write_options = dict(
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="replace",
        es_refresh=True,
    )

    # 'replace' must allow for creation of the index.
    first_write = pandas_to_eland(pd_df2, **write_options).to_pandas()
    assert_frame_equal(pd_df2, first_write)

    # 'replace' must swap out the existing mapping and entries.
    second_write = pandas_to_eland(pd_df, **write_options)
    assert_pandas_eland_frame_equal(pd_df, second_write)

    # Writing the first frame again must reproduce the first result.
    third_write = pandas_to_eland(pd_df2, **write_options).to_pandas()
    assert_frame_equal(first_write, third_write)
def test_es_if_exists_append_mapping_mismatch(self):
    """Appending a frame that conflicts with the index mapping must fail.

    ``pd_df`` is written first; appending ``pd_df2`` afterwards is expected
    to raise ``ValueError`` with an exact, multi-line explanation, and the
    data already in the index must be left as it was.
    """
    original = pandas_to_eland(
        pd_df,
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="append",
        es_refresh=True,
    )

    expected_message = (
        "DataFrame dtypes and Elasticsearch index mapping aren't compatible:\n"
        "- 'b' is missing from DataFrame columns\n"
        "- 'c' is missing from DataFrame columns\n"
        "- 'd' is missing from DataFrame columns\n"
        "- 'Z' is missing from ES index mapping\n"
        "- 'a' column type ('keyword') not compatible with ES index mapping type ('long')"
    )

    with pytest.raises(ValueError) as exc_info:
        pandas_to_eland(
            pd_df2,
            es_client=ES_TEST_CLIENT,
            es_dest_index="test-index",
            es_if_exists="append",
        )
    assert str(exc_info.value) == expected_message

    # The failed append must not have modified the index.
    assert_pandas_eland_frame_equal(pd_df, original)
def test_head_0(self):
    """``head(0)`` on the flights data must match pandas' ``head(0)``."""
    ed_frame = self.ed_flights()
    pd_frame = self.pd_flights()

    ed_empty = ed_frame.head(0)
    pd_empty = pd_frame.head(0)

    assert_pandas_eland_frame_equal(pd_empty, ed_empty)
def test_es_if_exists_append_es_type_coerce_error(self):
    """Appending a value that does not fit the overridden ES type must fail.

    Column ``a`` is written with a ``byte`` type override, then a row with
    ``a == 128`` is appended; the bulk request is expected to raise
    ``BulkIndexError`` mentioning the out-of-range value.
    """
    seeded = pandas_to_eland(
        pd_df,
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="append",
        es_refresh=True,
        es_type_overrides={"a": "byte"},
    )
    assert_pandas_eland_frame_equal(pd_df, seeded)

    overflow_row = pd.DataFrame(
        {
            "a": [128],  # This value is too large for 'byte'
            "b": [-1.0],
            "c": ["A"],
            "d": [dt],
        },
        index=["3"],
    )

    with pytest.raises(BulkIndexError) as exc_info:
        pandas_to_eland(
            overflow_row,
            es_client=ES_TEST_CLIENT,
            es_dest_index="test-index",
            es_if_exists="append",
        )

    # The value 128 must be what caused the index error.
    failure_text = str(exc_info.value)
    assert "Value [128] is out of range for a byte" in failure_text
def test_generate_es_mappings(self):
    """Check the ES mappings generated for a mixed-dtype frame, then round-trip it.

    First compares ``FieldMappings._generate_es_mappings`` with the expected
    mapping dict, then writes the frame to a scratch index and compares what
    is read back with the original frame.
    """
    df = pd.DataFrame(
        data={
            "A": np.random.rand(3),
            "B": 1,
            "C": "foo",
            "D": pd.Timestamp("20190102"),
            "E": [1.0, 2.0, 3.0],
            "F": False,
            "G": [1, 2, 3],
        },
        index=["0", "1", "2"],
    )

    expected_mappings = {
        "mappings": {
            "properties": {
                "A": {"type": "double"},
                "B": {"type": "long"},
                "C": {"type": "keyword"},
                "D": {"type": "date"},
                "E": {"type": "double"},
                "F": {"type": "boolean"},
                "G": {"type": "long"},
            }
        }
    }

    mappings = FieldMappings._generate_es_mappings(df)
    assert expected_mappings == mappings

    # Now create index
    index_name = "eland_test_generate_es_mappings"
    ed_df = ed.pandas_to_eland(
        df, ES_TEST_CLIENT, index_name, es_if_exists="replace", es_refresh=True
    )
    try:
        ed_df_head = ed_df.head()
        assert_pandas_eland_frame_equal(df, ed_df_head)
    finally:
        # Remove the scratch index even when the comparison above fails, so a
        # failing run does not leave it behind.
        ES_TEST_CLIENT.indices.delete(index=index_name)
def test_flights_filter_columns_like(self, like):
    """``filter(like=...)`` on the small flights data must match pandas.

    ``like`` is supplied by the caller (presumably a pytest parametrization
    that is not visible here).
    """
    criteria = {"like": like}

    ed_source = self.ed_flights_small()
    pd_source = self.pd_flights_small()

    ed_filtered = ed_source.filter(**criteria)
    pd_filtered = pd_source.filter(**criteria)

    assert_pandas_eland_frame_equal(pd_filtered, ed_filtered)
def test_flights_filter_columns_items(self, items):
    """``filter(items=...)`` on the small flights data must match pandas.

    ``items`` is supplied by the caller (presumably a pytest parametrization
    that is not visible here).
    """
    criteria = {"items": items}

    ed_source = self.ed_flights_small()
    pd_source = self.pd_flights_small()

    ed_filtered = ed_source.filter(**criteria)
    pd_filtered = pd_source.filter(**criteria)

    assert_pandas_eland_frame_equal(pd_filtered, ed_filtered)
def test_getitem_attribute_list(self):
    """Selecting a list of columns via ``[...]`` must match pandas.

    Only the first 42 rows of the flights data are compared.
    """
    wanted = ("OriginAirportID", "AvgTicketPrice", "Carrier")

    ed_top = self.ed_flights().head(42)
    pd_top = self.pd_flights().head(42)

    # A fresh list is handed to each frame, as in a literal list selection.
    ed_selected = ed_top[list(wanted)]
    pd_selected = pd_top[list(wanted)]

    assert_pandas_eland_frame_equal(pd_selected, ed_selected)
def test_flights_select_dtypes(self):
    """``select_dtypes(include=np.number)`` must match pandas on small flights."""
    pd_source = self.pd_flights_small()
    ed_source = self.ed_flights_small()

    pd_numeric = pd_source.select_dtypes(include=np.number)
    ed_numeric = ed_source.select_dtypes(include=np.number)

    assert_pandas_eland_frame_equal(pd_numeric, ed_numeric)
def test_flights_filter_index_items(self, items):
    """``filter(items=..., axis=0)`` on the small flights data must match pandas.

    ``items`` is supplied by the caller (presumably a pytest parametrization
    that is not visible here).
    """
    criteria = {"items": items, "axis": 0}

    ed_source = self.ed_flights_small()
    pd_source = self.pd_flights_small()

    ed_filtered = ed_source.filter(**criteria)
    pd_filtered = pd_source.filter(**criteria)

    assert_pandas_eland_frame_equal(pd_filtered, ed_filtered)
def test_flights_filter_columns_regex(self, regex):
    """``filter(regex=...)`` on the small flights data must match pandas.

    ``regex`` is supplied by the caller (presumably a pytest parametrization
    that is not visible here).
    """
    criteria = {"regex": regex}

    ed_source = self.ed_flights_small()
    pd_source = self.pd_flights_small()

    ed_filtered = ed_source.filter(**criteria)
    pd_filtered = pd_source.filter(**criteria)

    assert_pandas_eland_frame_equal(pd_filtered, ed_filtered)
def test_isna(self):
    """Row selection by ``isna()`` on "geoip.region_name" must match pandas.

    The pandas reference frame is obtained by converting the eland ecommerce
    frame with ``eland_to_pandas``.
    """
    field = "geoip.region_name"

    ed_frame = self.ed_ecommerce()
    pd_frame = eland_to_pandas(ed_frame)

    ed_missing = ed_frame[ed_frame[field].isna()]
    pd_missing = pd_frame[pd_frame[field].isna()]

    assert_pandas_eland_frame_equal(pd_missing, ed_missing)
def test_notna(self):
    """Row selection by ``notna()`` must match pandas for each of ``self.columns``.

    The pandas reference frame is obtained by converting the eland ecommerce
    frame with ``eland_to_pandas``.
    """
    ed_frame = self.ed_ecommerce()
    pd_frame = eland_to_pandas(ed_frame)

    for name in self.columns:
        ed_mask = ed_frame[name].notna()
        ed_present = ed_frame[ed_mask]

        pd_mask = pd_frame[name].notna()
        pd_present = pd_frame[pd_mask]

        assert_pandas_eland_frame_equal(pd_present, ed_present)
def test_select_dtypes_exclude_number(self):
    """``select_dtypes(exclude=[np.number])`` must match pandas on flights.

    Only the first 103 rows of each result are compared.
    """
    row_limit = 103

    ed_source = self.ed_flights()
    pd_source = self.pd_flights()

    ed_non_numeric = ed_source.select_dtypes(exclude=[np.number])
    pd_non_numeric = pd_source.select_dtypes(exclude=[np.number])

    assert_pandas_eland_frame_equal(
        pd_non_numeric.head(row_limit),
        ed_non_numeric.head(row_limit),
    )
def test_getitem_query(self):
    """Boolean-mask row selection on an eland frame must match pandas.

    A small three-column frame is written to a scratch index, then several
    comparison and combined (``&``) masks are applied to both the pandas and
    the eland frame and the results compared.
    """
    # Examples from:
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
    pd_df = pd.DataFrame(
        {
            "A": range(1, 6),
            "B": range(10, 0, -2),
            "C": range(10, 5, -1),
        },
        index=["0", "1", "2", "3", "4"],
    )
    # >>> pd_df
    #    A   B   C
    # 0  1  10  10
    # 1  2   8   9
    # 2  3   6   8
    # 3  4   4   7
    # 4  5   2   6

    # Now create index
    index_name = "eland_test_query"
    ed_df = ed.pandas_to_eland(
        pd_df, ES_TEST_CLIENT, index_name, es_if_exists="replace", es_refresh=True
    )
    try:
        assert_pandas_eland_frame_equal(pd_df, ed_df)

        pd_df.info()
        ed_df.info()

        pd_q1 = pd_df[pd_df.A > 2]
        pd_q2 = pd_df[pd_df.A > pd_df.B]
        pd_q3 = pd_df[pd_df.B == pd_df.C]

        ed_q1 = ed_df[ed_df.A > 2]
        ed_q2 = ed_df[ed_df.A > ed_df.B]
        ed_q3 = ed_df[ed_df.B == ed_df.C]

        assert_pandas_eland_frame_equal(pd_q1, ed_q1)
        assert_pandas_eland_frame_equal(pd_q2, ed_q2)
        assert_pandas_eland_frame_equal(pd_q3, ed_q3)

        pd_q4 = pd_df[(pd_df.A > 2) & (pd_df.B > 3)]
        ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)]

        assert_pandas_eland_frame_equal(pd_q4, ed_q4)
    finally:
        # Remove the scratch index even when a comparison fails. The index
        # name is passed by keyword because newer Elasticsearch clients only
        # accept API parameters as keyword arguments.
        ES_TEST_CLIENT.indices.delete(index=index_name)
def test_flights_drop_all_columns(self):
    """Dropping every column must match pandas for both ``drop`` spellings.

    Covers ``drop(labels=..., axis=1)`` and ``drop(columns=...)``, comparing
    both the resulting frames and their ``columns`` indexes.
    """
    ed_source = self.ed_flights_small()
    pd_source = self.pd_flights_small()

    every_column = ed_source.columns

    pd_by_labels = pd_source.drop(labels=every_column, axis=1)
    pd_by_columns = pd_source.drop(columns=every_column)
    ed_by_labels = ed_source.drop(labels=every_column, axis=1)
    ed_by_columns = ed_source.drop(columns=every_column)

    result_pairs = (
        (pd_by_labels, ed_by_labels),
        (pd_by_columns, ed_by_columns),
    )

    for pd_result, ed_result in result_pairs:
        assert_pandas_eland_frame_equal(pd_result, ed_result)

    for pd_result, ed_result in result_pairs:
        assert ed_result.columns.equals(pd_result.columns)
def test_flights_small_drop(self):
    """Dropping two columns, then two rows, must match pandas.

    Columns are dropped with both ``drop(..., axis=1)`` and
    ``drop(columns=...)``; rows are dropped by index label.
    """
    ed_source = self.ed_flights_small()
    pd_source = self.pd_flights_small()

    # Columns of the flights data, for reference:
    # ['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID',
    #  'DestCityName', 'DestCountry', 'DestLocation', 'DestRegion',
    #  'DestWeather', 'DistanceKilometers', 'DistanceMiles', 'FlightDelay',
    #  'FlightDelayMin', 'FlightDelayType', 'FlightNum', 'FlightTimeHour',
    #  'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
    #  'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather',
    #  'dayOfWeek', 'timestamp']
    pd_by_axis = pd_source.drop(["Carrier", "DestCityName"], axis=1)
    pd_by_columns = pd_source.drop(columns=["Carrier", "DestCityName"])
    ed_by_axis = ed_source.drop(["Carrier", "DestCityName"], axis=1)
    ed_by_columns = ed_source.drop(columns=["Carrier", "DestCityName"])

    for pd_result, ed_result in (
        (pd_by_axis, ed_by_axis),
        (pd_by_columns, ed_by_columns),
    ):
        assert_pandas_eland_frame_equal(pd_result, ed_result)

    # Drop rows by index
    pd_without_rows = pd_source.drop(["1", "2"])
    ed_without_rows = ed_source.drop(["1", "2"])

    assert_pandas_eland_frame_equal(pd_without_rows, ed_without_rows)
def test_es_if_exists_append(self):
    """Appending a compatible frame must extend the existing index.

    ``pd_df`` is written with a ``short`` override for column ``a``, then a
    second three-row frame is appended. The resulting eland frame must
    contain both sets of rows.
    """
    df1 = pandas_to_eland(
        pd_df,
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="append",
        es_refresh=True,
        # We use 'short' here specifically so that the
        # assumed type of 'long' is coerced into a 'short'
        # by append mode.
        es_type_overrides={"a": "short"},
    )
    assert_pandas_eland_frame_equal(pd_df, df1)
    assert df1.shape == (3, 4)

    pd_df2 = pd.DataFrame(
        {
            "a": [4, 5, 6],
            "b": [-1.0, -2.0, -3.0],
            "c": ["A", "B", "C"],
            "d": [dt, dt - timedelta(1), dt - timedelta(2)],
        },
        index=["3", "4", "5"],
    )
    df2 = pandas_to_eland(
        pd_df2,
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="append",
        es_refresh=True,
    )

    # Assert that the second pandas dataframe is actually appended
    assert df2.shape == (6, 4)

    # DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat() builds the same row-wise combination on all versions.
    pd_df3 = pd.concat([pd_df, pd_df2])
    assert_pandas_eland_frame_equal(pd_df3, df2)
def test_tail_head(self):
    """Alternating ``tail``/``head`` calls chained on flights must match pandas.

    The chain is tail(10) -> head(8) -> tail(5) -> head(4), with the eland
    and pandas results compared after every step.
    """
    ed_view = self.ed_flights()
    pd_view = self.pd_flights()

    chain = (("tail", 10), ("head", 8), ("tail", 5), ("head", 4))

    for method_name, row_count in chain:
        # Each step operates on the result of the previous one.
        ed_view = getattr(ed_view, method_name)(row_count)
        pd_view = getattr(pd_view, method_name)(row_count)
        assert_pandas_eland_frame_equal(pd_view, ed_view)
def test_head(self):
    """``head`` and ``head`` applied to a ``head`` result must match pandas.

    After taking head(10), both head(8) and head(20) are applied to that
    10-row selection, so the second asks for more rows than were selected.
    """
    ed_source = self.ed_flights()
    pd_source = self.pd_flights()

    ed_top = ed_source.head(10)
    pd_top = pd_source.head(10)
    assert_pandas_eland_frame_equal(pd_top, ed_top)

    for row_count in (8, 20):
        ed_again = ed_top.head(row_count)
        pd_again = pd_top.head(row_count)
        assert_pandas_eland_frame_equal(pd_again, ed_again)
def test_tail(self):
    """``tail`` and ``tail`` applied to a ``tail`` result must match pandas.

    After taking tail(10), both tail(8) and tail(20) are applied to that
    10-row selection, so the second asks for more rows than were selected.
    """
    ed_source = self.ed_flights()
    pd_source = self.pd_flights()

    ed_bottom = ed_source.tail(10)
    pd_bottom = pd_source.tail(10)
    assert_pandas_eland_frame_equal(pd_bottom, ed_bottom)

    for row_count in (8, 20):
        ed_again = ed_bottom.tail(row_count)
        pd_again = pd_bottom.tail(row_count)
        assert_pandas_eland_frame_equal(pd_again, ed_again)