def test_es_if_exists_append_es_type_coerce_error(self):
    """Appending a value that does not fit the 'byte' field must fail.

    ``pd_df`` is first sent to "test-index" in append mode with column
    'a' overridden to the Elasticsearch 'byte' type.  A follow-up append
    carrying 128 in that column is then expected to raise
    ``BulkIndexError`` whose message names the offending value.
    """
    destination = dict(
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="append",
    )

    seeded = pandas_to_eland(
        pd_df,
        es_refresh=True,
        es_type_overrides={"a": "byte"},
        **destination,
    )
    assert_pandas_eland_frame_equal(pd_df, seeded)

    # Single row whose 'a' value (128) is too large for 'byte'.
    overflow_row = pd.DataFrame(
        {"a": [128], "b": [-1.0], "c": ["A"], "d": [dt]},
        index=["3"],
    )

    with pytest.raises(BulkIndexError) as exc_info:
        pandas_to_eland(overflow_row, **destination)

    # The bulk failure has to be attributable to the value 128.
    failure_text = str(exc_info.value)
    assert "Value [128] is out of range for a byte" in failure_text
def test_es_if_exists_replace(self):
    """Write to "test-index" three times with ``es_if_exists="replace"``.

    The sequence is ``pd_df2`` -> ``pd_df`` -> ``pd_df2``; after each write
    the data read back is compared with what was just written (and, for
    the last write, with the result of the first one).
    """

    def replace_with(frame):
        # Every write in this test targets the same index in replace mode.
        return pandas_to_eland(
            frame,
            es_client=ES_TEST_CLIENT,
            es_dest_index="test-index",
            es_if_exists="replace",
            es_refresh=True,
        )

    # Assert that 'replace' allows for creation
    first_read = replace_with(pd_df2).to_pandas()
    assert_frame_equal(pd_df2, first_read)

    # Assert that 'replace' will replace existing mapping and entries
    replaced = replace_with(pd_df)
    assert_pandas_eland_frame_equal(pd_df, replaced)

    # Writing the first frame again must reproduce the first read-back.
    third_read = replace_with(pd_df2).to_pandas()
    assert_frame_equal(first_read, third_read)
def test_head_0(self):
    """Check that ``head(0)`` on the eland flights frame matches pandas."""
    ed_empty = self.ed_flights().head(0)
    pd_empty = self.pd_flights().head(0)

    assert_pandas_eland_frame_equal(pd_empty, ed_empty)
def test_es_if_exists_append_mapping_mismatch(self):
    """Appending ``pd_df2`` after ``pd_df`` must raise a detailed ValueError.

    ``pd_df`` is written to "test-index" first; appending ``pd_df2`` to the
    same index is then expected to fail with a message listing every
    column mismatch, and the eland frame from the first write must still
    equal ``pd_df`` afterwards.
    """
    destination = dict(
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="append",
    )

    original = pandas_to_eland(pd_df, es_refresh=True, **destination)

    with pytest.raises(ValueError) as exc_info:
        pandas_to_eland(pd_df2, **destination)

    expected_message = (
        "DataFrame dtypes and Elasticsearch index mapping aren't compatible:\n"
        "- 'b' is missing from DataFrame columns\n"
        "- 'c' is missing from DataFrame columns\n"
        "- 'd' is missing from DataFrame columns\n"
        "- 'Z' is missing from ES index mapping\n"
        "- 'a' column type ('keyword') not compatible with ES index mapping type ('long')"
    )
    assert str(exc_info.value) == expected_message

    # Assert that the index isn't modified
    assert_pandas_eland_frame_equal(pd_df, original)
def test_notna(self):
    """Filter the ecommerce frame with ``notna()`` and compare with pandas.

    For each name in ``self.columns`` the rows where that column is not NA
    are selected from both the eland frame and its pandas conversion, and
    the two selections are asserted equal.
    """
    ed_frame = self.ed_ecommerce()
    pd_frame = eland_to_pandas(ed_frame)

    for name in self.columns:
        ed_mask = ed_frame[name].notna()
        ed_present = ed_frame[ed_mask]

        pd_mask = pd_frame[name].notna()
        pd_present = pd_frame[pd_mask]

        assert_pandas_eland_frame_equal(pd_present, ed_present)
def test_isna(self):
    """Select rows whose "geoip.region_name" is NA and compare with pandas."""
    field = "geoip.region_name"

    ed_frame = self.ed_ecommerce()
    pd_frame = eland_to_pandas(ed_frame)

    ed_mask = ed_frame[field].isna()
    ed_missing = ed_frame[ed_mask]

    pd_mask = pd_frame[field].isna()
    pd_missing = pd_frame[pd_mask]

    assert_pandas_eland_frame_equal(pd_missing, ed_missing)
def test_select_dtypes_exclude_number(self):
    """Compare ``select_dtypes(exclude=[np.number])`` between eland and pandas.

    Only the first 103 rows of each result are compared.
    """
    row_limit = 103

    ed_selected = self.ed_flights().select_dtypes(exclude=[np.number])
    pd_selected = self.pd_flights().select_dtypes(exclude=[np.number])

    pd_sample = pd_selected.head(row_limit)
    ed_sample = ed_selected.head(row_limit)
    assert_pandas_eland_frame_equal(pd_sample, ed_sample)
def test_getitem_query(self):
    """Compare boolean-mask ``__getitem__`` filtering between pandas and eland.

    A five-row frame is written to the "eland_test_query" index and then
    filtered with scalar comparisons, column-to-column comparisons and a
    combined ``&`` mask on both the pandas and the eland side; every pair
    of results must be equal.  The index is deleted when the test ends,
    whether or not the assertions pass.
    """
    # Examples from:
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
    pd_df = pd.DataFrame(
        {
            "A": range(1, 6),
            "B": range(10, 0, -2),
            "C": range(10, 5, -1),
        },
        index=["0", "1", "2", "3", "4"],
    )
    # >>> pd_df
    #    A   B   C
    # 0  1  10  10
    # 1  2   8   9
    # 2  3   6   8
    # 3  4   4   7
    # 4  5   2   6

    # Now create index
    index_name = "eland_test_query"
    ed_df = ed.pandas_to_eland(
        pd_df,
        ES_TEST_CLIENT,
        index_name,
        es_if_exists="replace",
        es_refresh=True,
    )

    # From here on the index exists, so make sure it is removed even when an
    # assertion below fails.
    try:
        assert_pandas_eland_frame_equal(pd_df, ed_df)

        # Smoke-call info() on both frames; the printed output is not checked.
        pd_df.info()
        ed_df.info()

        pd_q1 = pd_df[pd_df.A > 2]
        pd_q2 = pd_df[pd_df.A > pd_df.B]
        pd_q3 = pd_df[pd_df.B == pd_df.C]

        ed_q1 = ed_df[ed_df.A > 2]
        ed_q2 = ed_df[ed_df.A > ed_df.B]
        ed_q3 = ed_df[ed_df.B == ed_df.C]

        assert_pandas_eland_frame_equal(pd_q1, ed_q1)
        assert_pandas_eland_frame_equal(pd_q2, ed_q2)
        assert_pandas_eland_frame_equal(pd_q3, ed_q3)

        pd_q4 = pd_df[(pd_df.A > 2) & (pd_df.B > 3)]
        ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)]

        assert_pandas_eland_frame_equal(pd_q4, ed_q4)
    finally:
        # Keyword form works with clients that accept the index positionally
        # and with those that only accept keyword arguments.
        ES_TEST_CLIENT.indices.delete(index=index_name)
def test_datetime_to_ms(self):
    """Check generated ES mappings for a mixed-dtype frame, then round-trip it.

    The frame mixes float, int, str, timestamp and bool columns.  The
    mappings produced by ``FieldMappings._generate_es_mappings`` are compared
    with the expected field types, after which the frame is written to the
    "eland_test_generate_es_mappings" index and the resulting eland frame is
    compared with the original (dtypes and contents).
    """
    column_data = {
        "A": np.random.rand(3),
        "B": 1,
        "C": "foo",
        "D": pd.Timestamp("20190102"),
        "E": [1.0, 2.0, 3.0],
        "F": False,
        "G": [1, 2, 3],
    }
    df = pd.DataFrame(data=column_data, index=["0", "1", "2"])

    expected_types = {
        "A": "double",
        "B": "long",
        "C": "keyword",
        "D": "date",
        "E": "double",
        "F": "boolean",
        "G": "long",
    }
    expected_mappings = {
        "mappings": {
            "properties": {
                field: {"type": es_type}
                for field, es_type in expected_types.items()
            }
        }
    }

    mappings = FieldMappings._generate_es_mappings(df)
    assert expected_mappings == mappings

    # Now create index
    index_name = "eland_test_generate_es_mappings"
    ed_df = ed.pandas_to_eland(
        df,
        ES_TEST_CLIENT,
        index_name,
        es_if_exists="replace",
        es_refresh=True,
    )

    assert_series_equal(df.dtypes, ed_df.dtypes)
    assert_pandas_eland_frame_equal(df, ed_df)
def test_es_if_exists_append(self):
    """Append two frames to "test-index" and check the combined result.

    ``pd_df`` is written first (with column 'a' overridden to 'short'), then
    a second three-row frame is appended.  The eland frame returned by the
    second write must have six rows and equal the concatenation of both
    pandas frames.
    """
    df1 = pandas_to_eland(
        pd_df,
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="append",
        es_refresh=True,
        # We use 'short' here specifically so that the
        # assumed type of 'long' is coerced into a 'short'
        # by append mode.
        es_type_overrides={"a": "short"},
    )
    assert_pandas_eland_frame_equal(pd_df, df1)
    assert df1.shape == (3, 4)

    # Named differently from the outer 'pd_df2' fixture used by sibling
    # tests, so this local frame does not shadow it.
    pd_df_extra = pd.DataFrame(
        {
            "a": [4, 5, 6],
            "b": [-1.0, -2.0, -3.0],
            "c": ["A", "B", "C"],
            "d": [dt, dt - timedelta(1), dt - timedelta(2)],
        },
        index=["3", "4", "5"],
    )
    df2 = pandas_to_eland(
        pd_df_extra,
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="append",
        es_refresh=True,
    )

    # Assert that the second pandas dataframe is actually appended
    assert df2.shape == (6, 4)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent row-wise concatenation that keeps both indexes.
    pd_combined = pd.concat([pd_df, pd_df_extra])
    assert_pandas_eland_frame_equal(pd_combined, df2)
def test_tail_head(self):
    """Chain alternating ``tail``/``head`` calls and compare at every step.

    Starting from the flights frames, the chain is tail(10) -> head(8) ->
    tail(5) -> head(4); each step is applied to the previous step's result
    on both the eland and the pandas side.
    """
    ed_frame = self.ed_flights()
    pd_frame = self.pd_flights()

    steps = (("tail", 10), ("head", 8), ("tail", 5), ("head", 4))
    for method_name, row_count in steps:
        ed_frame = getattr(ed_frame, method_name)(row_count)
        pd_frame = getattr(pd_frame, method_name)(row_count)
        assert_pandas_eland_frame_equal(pd_frame, ed_frame)
def test_tail(self):
    """Compare ``tail`` between eland and pandas, including nested tails.

    ``tail(10)`` is taken from the flights frames first; ``tail(8)`` and
    ``tail(20)`` are then both applied to that 10-row result.
    """
    ed_last_10 = self.ed_flights().tail(10)
    pd_last_10 = self.pd_flights().tail(10)
    assert_pandas_eland_frame_equal(pd_last_10, ed_last_10)

    # Both follow-up sizes start from the tail(10) frames, not from each other.
    for row_count in (8, 20):
        ed_nested = ed_last_10.tail(row_count)
        pd_nested = pd_last_10.tail(row_count)
        assert_pandas_eland_frame_equal(pd_nested, ed_nested)
def test_head(self):
    """Compare ``head`` between eland and pandas, including nested heads.

    ``head(10)`` is taken from the flights frames first; ``head(8)`` and
    ``head(20)`` are then both applied to that 10-row result.
    """
    ed_first_10 = self.ed_flights().head(10)
    pd_first_10 = self.pd_flights().head(10)
    assert_pandas_eland_frame_equal(pd_first_10, ed_first_10)

    # Both follow-up sizes start from the head(10) frames, not from each other.
    for row_count in (8, 20):
        ed_nested = ed_first_10.head(row_count)
        pd_nested = pd_first_10.head(row_count)
        assert_pandas_eland_frame_equal(pd_nested, ed_nested)