def test_es_if_exists_append_mapping_mismatch(self):
    """Appending a frame whose columns/dtypes conflict with the existing
    index mapping must raise ValueError and leave the index untouched."""
    original = pandas_to_eland(
        pd_df,
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="append",
        es_refresh=True,
    )

    with pytest.raises(ValueError) as exc_info:
        pandas_to_eland(
            pd_df2,
            es_client=ES_TEST_CLIENT,
            es_dest_index="test-index",
            es_if_exists="append",
        )

    expected_message = (
        "DataFrame dtypes and Elasticsearch index mapping aren't compatible:\n"
        "- 'b' is missing from DataFrame columns\n"
        "- 'c' is missing from DataFrame columns\n"
        "- 'd' is missing from DataFrame columns\n"
        "- 'Z' is missing from ES index mapping\n"
        "- 'a' column type ('keyword') not compatible with ES index mapping type ('long')"
    )
    assert str(exc_info.value) == expected_message

    # Assert that the index isn't modified
    assert_pandas_eland_frame_equal(pd_df, original)
def test_es_if_exists_replace(self):
    """'replace' must create the index when missing and overwrite both the
    mapping and the documents when it already exists."""
    # Assert that 'replace' allows for creation
    round_trip_first = pandas_to_eland(
        pd_df2,
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="replace",
        es_refresh=True,
    ).to_pandas()
    assert_frame_equal(pd_df2, round_trip_first)

    # Assert that 'replace' will replace existing mapping and entries
    replaced = pandas_to_eland(
        pd_df,
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="replace",
        es_refresh=True,
    )
    assert_pandas_eland_frame_equal(pd_df, replaced)

    # Replacing again with the first frame restores the first round-trip.
    round_trip_second = pandas_to_eland(
        pd_df2,
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="replace",
        es_refresh=True,
    ).to_pandas()
    assert_frame_equal(round_trip_first, round_trip_second)
def test_es_if_exists_append_es_type_coerce_error(self):
    """A value that cannot be coerced into the overridden ES type ('byte')
    must surface as a BulkIndexError when appending."""
    seeded = pandas_to_eland(
        pd_df,
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="append",
        es_refresh=True,
        es_type_overrides={"a": "byte"},
    )
    assert_pandas_eland_frame_equal(pd_df, seeded)

    out_of_range = pd.DataFrame(
        {
            "a": [128],  # This value is too large for 'byte'
            "b": [-1.0],
            "c": ["A"],
            "d": [dt],
        },
        index=["3"],
    )

    with pytest.raises(BulkIndexError) as exc_info:
        pandas_to_eland(
            out_of_range,
            es_client=ES_TEST_CLIENT,
            es_dest_index="test-index",
            es_if_exists="append",
        )

    # Assert that the value 128 caused the index error
    assert "Value [128] is out of range for a byte" in str(exc_info.value)
def test_es_type_override_error(self):
    """es_type_overrides naming columns that are absent from the dataframe
    must raise KeyError listing the offending columns."""
    columns = [
        "AvgTicketPrice",
        "Cancelled",
        "dayOfWeek",
        "timestamp",
        "DestCountry",
    ]
    df = self.pd_flights().filter(columns)

    index_name = "test_es_type_override"

    with pytest.raises(KeyError) as exc_info:
        ed.pandas_to_eland(
            df,
            ES_TEST_CLIENT,
            index_name,
            es_if_exists="replace",
            es_refresh=True,
            use_pandas_index_for_es_ids=False,
            es_type_overrides={
                "AvgTicketPrice": "long",
                "DistanceKilometers": "text",
                "DistanceMiles": "text",
            },
        )
    match = "'DistanceKilometers', 'DistanceMiles' column(s) not in given dataframe"
    assert str(exc_info.value) == match

    ES_TEST_CLIENT.indices.delete(index=index_name)
def test_es_if_exists_fail(self):
    """Default ('fail') mode: writing to an already-existing index raises
    a ValueError that points at the other es_if_exists options."""
    pandas_to_eland(pd_df, es_client=ES_TEST_CLIENT, es_dest_index="test-index")

    with pytest.raises(ValueError) as exc_info:
        pandas_to_eland(
            pd_df, es_client=ES_TEST_CLIENT, es_dest_index="test-index"
        )

    expected_message = (
        "Could not create the index [test-index] because it "
        "already exists. Change the 'es_if_exists' parameter "
        "to 'append' or 'replace' data."
    )
    assert str(exc_info.value) == expected_message
def save_output(self, df):
    """Index the given dataframe into the configured Elasticsearch output index.

    Rows are appended (``es_if_exists="append"``) and the index is refreshed
    so the documents are immediately searchable.

    :param df: Dataframe to be saved in ES. Must have an ``id`` column.
    :return: None
    """
    # Save result to ES
    self.log.info('Saving Output: to {}'.format(self.config.dataloader.output_index))
    # Use the 'id' column as the frame's index; pandas_to_eland uses the
    # index for ES document ids by default — presumably so repeated saves
    # of the same rows don't duplicate documents (confirm against config).
    df = df.set_index(df.id)
    ed.pandas_to_eland(df,
                       self.config.dataloader.elasticsearch_host,
                       self.config.dataloader.output_index,
                       es_if_exists="append",
                       es_refresh=True)
def test_returns_eland_dataframe(self):
    """pandas_to_eland must hand back an eland DataFrame bound to the
    destination index."""
    result = pandas_to_eland(
        pd_df, es_client=ES_TEST_CLIENT, es_dest_index="test-index"
    )

    assert isinstance(result, DataFrame)
    assert "es_index_pattern: test-index" in result.es_info()
def test_generate_es_mappings(self):
    """_generate_es_mappings must translate pandas dtypes into the expected
    ES field types, and the mapping must round-trip through a real index."""
    df = pd.DataFrame(
        data={
            "A": np.random.rand(3),
            "B": 1,
            "C": "foo",
            "D": pd.Timestamp("20190102"),
            "E": [1.0, 2.0, 3.0],
            "F": False,
            "G": [1, 2, 3],
        },
        index=["0", "1", "2"],
    )

    expected_mappings = {
        "mappings": {
            "properties": {
                "A": {"type": "double"},
                "B": {"type": "long"},
                "C": {"type": "keyword"},
                "D": {"type": "date"},
                "E": {"type": "double"},
                "F": {"type": "boolean"},
                "G": {"type": "long"},
            }
        }
    }
    assert FieldMappings._generate_es_mappings(df) == expected_mappings

    # Now create index
    index_name = "eland_test_generate_es_mappings"
    ed_df = ed.pandas_to_eland(
        df, ES_TEST_CLIENT, index_name, es_if_exists="replace", es_refresh=True
    )
    assert_pandas_eland_frame_equal(df, ed_df.head())

    ES_TEST_CLIENT.indices.delete(index=index_name)
def test_multiitem_query(self):
    """Boolean/column-vs-column filtering on an eland frame matches pandas.

    Examples from:
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
    """
    pd_df = pd.DataFrame(
        {"A": range(1, 6), "B": range(10, 0, -2), "C": range(10, 5, -1)},
        index=["0", "1", "2", "3", "4"],
    )
    """
    >>> pd_df
       A   B   C
    0  1  10  10
    1  2   8   9
    2  3   6   8
    3  4   4   7
    4  5   2   6
    """
    # Now create index
    index_name = "eland_test_query"
    ed_df = ed.pandas_to_eland(
        pd_df, ES_TEST_CLIENT, index_name, es_if_exists="replace", es_refresh=True
    )
    assert_pandas_eland_frame_equal(pd_df, ed_df)

    pd_df.info()
    ed_df.info()

    # Scalar comparison, column-vs-column comparison, and equality filter
    # must all behave identically to pandas.
    pd_q1 = pd_df[pd_df.A > 2]
    pd_q2 = pd_df[pd_df.A > pd_df.B]
    pd_q3 = pd_df[pd_df.B == pd_df.C]

    ed_q1 = ed_df[ed_df.A > 2]
    ed_q2 = ed_df[ed_df.A > ed_df.B]
    ed_q3 = ed_df[ed_df.B == ed_df.C]

    assert_pandas_eland_frame_equal(pd_q1, ed_q1)
    assert_pandas_eland_frame_equal(pd_q2, ed_q2)
    assert_pandas_eland_frame_equal(pd_q3, ed_q3)

    # query() chained on an already-filtered frame
    ed_q4 = ed_q1.query("B > 2")
    pd_q4 = pd_q1.query("B > 2")
    assert_pandas_eland_frame_equal(pd_q4, ed_q4)

    # Drop rows by index
    ed_q4 = ed_q4.drop(["2"])
    pd_q4 = pd_q4.drop(["2"])
    assert_pandas_eland_frame_equal(pd_q4, ed_q4)

    # BUG FIX: pass 'index' as a keyword argument — recent elasticsearch-py
    # clients make API parameters keyword-only, and every other test in this
    # file already calls indices.delete(index=...).
    ES_TEST_CLIENT.indices.delete(index=index_name)
def build_index(filepath):
    """Load the corresponding csv file into a pandas dataframe and query each
    point against our sd_geo index appending the median lat/longs returned

    :param filepath: path to the CSV file; its stem becomes the ES index name.
    :return: None
    """
    df = pd.read_csv(filepath).fillna("")
    df = build_datetimes(df)

    # filters out only points with a valid address
    # BUG FIX: take an explicit .copy() — the .loc slice is a view of `df`,
    # and assigning the 'location' column to a view raises pandas'
    # SettingWithCopyWarning and may silently fail to set the column.
    address_points = df.loc[df.address_number_primary != 0].copy()
    address_points["location"] = address_points.apply(
        lambda x: get_closest_lat_long(x), axis=1
    )

    eland.pandas_to_eland(
        pd_df=address_points,
        es_client=es,
        es_dropna=True,
        es_dest_index=Path(filepath).stem,
        es_if_exists="replace",  # so that these items are overwriting existing records. This allows us to update existing indexes.
        es_type_overrides={"date_time": "date"},
    )
def test_es_if_exists_append(self):
    """Append mode must coerce dtypes via es_type_overrides on first write
    and grow the index when a second frame is appended."""
    df1 = pandas_to_eland(
        pd_df,
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="append",
        es_refresh=True,
        # We use 'short' here specifically so that the
        # assumed type of 'long' is coerced into a 'short'
        # by append mode.
        es_type_overrides={"a": "short"},
    )
    assert_pandas_eland_frame_equal(pd_df, df1)
    assert df1.shape == (3, 4)

    pd_df2 = pd.DataFrame(
        {
            "a": [4, 5, 6],
            "b": [-1.0, -2.0, -3.0],
            "c": ["A", "B", "C"],
            "d": [dt, dt - timedelta(1), dt - timedelta(2)],
        },
        index=["3", "4", "5"],
    )
    df2 = pandas_to_eland(
        pd_df2,
        es_client=ES_TEST_CLIENT,
        es_dest_index="test-index",
        es_if_exists="append",
        es_refresh=True,
    )

    # Assert that the second pandas dataframe is actually appended
    assert df2.shape == (6, 4)
    # BUG FIX: DataFrame.append() was deprecated in pandas 1.4 and removed
    # in pandas 2.0 — pd.concat is the supported equivalent.
    pd_df3 = pd.concat([pd_df, pd_df2])
    assert_pandas_eland_frame_equal(pd_df3, df2)
def test_pandas_to_eland_text_inserts_keyword(self):
    """A bare 'text' override gains a 'keyword' sub-field (aggregatable),
    while an explicit {'type': 'text'} mapping does not."""
    es = ES_TEST_CLIENT
    ed_df = pandas_to_eland(
        pd_df,
        es_client=es,
        es_dest_index="test-index",
        es_if_exists="append",
        es_refresh=True,
        es_type_overrides={
            "c": "text",
            "b": {"type": "float"},
            "d": {"type": "text"},
        },
    )

    expected_mapping = {
        "test-index": {
            "mappings": {
                "properties": {
                    "a": {"type": "long"},
                    "b": {"type": "float"},
                    "c": {
                        "fields": {"keyword": {"type": "keyword"}},
                        "type": "text",
                    },
                    "d": {"type": "text"},
                }
            }
        }
    }
    assert es.indices.get_mapping(index="test-index") == expected_mapping

    # 'c' is aggregatable on 'keyword'
    assert ed_df.groupby("c").mean().to_dict() == {
        "a": {"A": 1.0, "B": 2.0, "C": 3.0},
        "b": {"A": 1.0, "B": 2.0, "C": 3.0},
    }

    # 'd' isn't aggregatable because it's missing the 'keyword'
    with pytest.raises(ValueError) as exc_info:
        ed_df.groupby("d").mean()

    assert str(exc_info.value) == (
        "Cannot use 'd' with groupby() because it has "
        "no aggregatable fields in Elasticsearch"
    )
def test_pandas_to_eland_ignore_index(self):
    """With use_pandas_index_for_es_ids=False, ES must generate document
    ids itself instead of reusing the pandas index."""
    df = pd.DataFrame(
        data={
            "A": np.random.rand(3),
            "B": 1,
            "C": "foo",
            "D": pd.Timestamp("20190102"),
            "E": [1.0, 2.0, 3.0],
            "F": False,
            "G": [1, 2, 3],
            "H": "Long text",  # text
            "I": "52.36,4.83",  # geo point
        },
        index=["0", "1", "2"],
    )

    # Now create index
    index_name = "test_pandas_to_eland_ignore_index"
    ed_df = ed.pandas_to_eland(
        df,
        ES_TEST_CLIENT,
        index_name,
        es_if_exists="replace",
        es_refresh=True,
        use_pandas_index_for_es_ids=False,
        es_type_overrides={"H": "text", "I": "geo_point"},
    )

    # Check types
    expected_mapping = {
        "test_pandas_to_eland_ignore_index": {
            "mappings": {
                "properties": {
                    "A": {"type": "double"},
                    "B": {"type": "long"},
                    "C": {"type": "keyword"},
                    "D": {"type": "date"},
                    "E": {"type": "double"},
                    "F": {"type": "boolean"},
                    "G": {"type": "long"},
                    "H": {"type": "text"},
                    "I": {"type": "geo_point"},
                }
            }
        }
    }
    # BUG FIX: pass 'index' as a keyword argument — recent elasticsearch-py
    # clients make API parameters keyword-only (the delete() call below
    # already uses the keyword form).
    mapping = ES_TEST_CLIENT.indices.get_mapping(index=index_name)
    assert expected_mapping == mapping

    # Convert back to pandas and compare with original
    pd_df = ed.eland_to_pandas(ed_df)

    # Compare values excluding index
    # NOTE(review): `.all() == .all()` reduces each side to a single truth
    # value before comparing, so this is a very weak equality check —
    # consider assert_frame_equal on reset_index'd frames instead.
    assert df.values.all() == pd_df.values.all()

    # Ensure that index is populated by ES.
    assert not (df.index == pd_df.index).any()
    ES_TEST_CLIENT.indices.delete(index=index_name)