def test_generate_es_mappings(self):
    """Generated ES mappings for a mixed-dtype frame match the expected
    field types, and the frame round-trips through an Elasticsearch index."""
    frame = pd.DataFrame(
        data={
            "A": np.random.rand(3),
            "B": 1,
            "C": "foo",
            "D": pd.Timestamp("20190102"),
            "E": [1.0, 2.0, 3.0],
            "F": False,
            "G": [1, 2, 3],
        },
        index=["0", "1", "2"],
    )

    # Column name -> expected Elasticsearch field type.
    es_types = {
        "A": "double",
        "B": "long",
        "C": "keyword",
        "D": "date",
        "E": "double",
        "F": "boolean",
        "G": "long",
    }
    expected_mappings = {
        "mappings": {
            "properties": {col: {"type": t} for col, t in es_types.items()}
        }
    }

    assert expected_mappings == FieldMappings._generate_es_mappings(frame)

    # Now create index
    index_name = "eland_test_generate_es_mappings"

    ed_frame = ed.pandas_to_eland(
        frame, ES_TEST_CLIENT, index_name, es_if_exists="replace", es_refresh=True
    )
    assert_pandas_eland_frame_equal(frame, ed_frame.head())

    ES_TEST_CLIENT.indices.delete(index=index_name)
def pandas_to_eland(
    pd_df: pd.DataFrame,
    es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch],
    es_dest_index: str,
    es_if_exists: str = "fail",
    es_refresh: bool = False,
    es_dropna: bool = False,
    es_type_overrides: Optional[Mapping[str, str]] = None,
    thread_count: int = 4,
    chunksize: Optional[int] = None,
    use_pandas_index_for_es_ids: bool = True,
) -> DataFrame:
    """
    Append a pandas DataFrame to an Elasticsearch index.
    Mainly used in testing.
    Modifies the elasticsearch destination index

    Parameters
    ----------
    es_client: Elasticsearch client argument(s)
        - elasticsearch-py parameters or
        - elasticsearch-py instance
    es_dest_index: str
        Name of Elasticsearch index to be appended to
    es_if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the index already exists.

        - fail: Raise a ValueError.
        - replace: Delete the index before inserting new values.
        - append: Insert new values to the existing index. Create if does not exist.
    es_refresh: bool, default 'False'
        Refresh es_dest_index after bulk index
    es_dropna: bool, default 'False'
        * True: Remove missing values (see pandas.Series.dropna)
        * False: Include missing values - may cause bulk to fail
    es_type_overrides: dict, default None
        Dict of field_name: es_data_type that overrides default es data types
    thread_count: int
        number of the threads to use for the bulk requests
    chunksize: int, default None
        Number of pandas.DataFrame rows to read before bulk index into Elasticsearch
    use_pandas_index_for_es_ids: bool, default 'True'
        * True: pandas.DataFrame.index fields will be used to populate Elasticsearch '_id' fields.
        * False: Ignore pandas.DataFrame.index when indexing into Elasticsearch

    Returns
    -------
    eland.Dataframe
        eland.DataFrame referencing data in destination_index

    Raises
    ------
    ValueError
        If `es_if_exists` is not one of 'fail', 'replace', 'append', or if
        `es_if_exists='fail'` and the destination index already exists.

    Examples
    --------

    >>> pd_df = pd.DataFrame(data={'A': 3.141,
    ...                            'B': 1,
    ...                            'C': 'foo',
    ...                            'D': pd.Timestamp('20190102'),
    ...                            'E': [1.0, 2.0, 3.0],
    ...                            'F': False,
    ...                            'G': [1, 2, 3],
    ...                            'H': 'Long text - to be indexed as es type text'},
    ...                      index=['0', '1', '2'])
    >>> type(pd_df)
    <class 'pandas.core.frame.DataFrame'>
    >>> pd_df
           A  B  ...  G                                          H
    0  3.141  1  ...  1  Long text - to be indexed as es type text
    1  3.141  1  ...  2  Long text - to be indexed as es type text
    2  3.141  1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
    [3 rows x 8 columns]
    >>> pd_df.dtypes
    A           float64
    B             int64
    C            object
    D    datetime64[ns]
    E           float64
    F              bool
    G             int64
    H            object
    dtype: object

    Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`.
    Overwrite existing Elasticsearch index if it exists `if_exists="replace"`, and sync index so it is readable on return `refresh=True`


    >>> ed_df = ed.pandas_to_eland(pd_df,
    ...                            'localhost',
    ...                            'pandas_to_eland',
    ...                            es_if_exists="replace",
    ...                            es_refresh=True,
    ...                            es_type_overrides={'H':'text'}) # index field 'H' as text not keyword
    >>> type(ed_df)
    <class 'eland.dataframe.DataFrame'>
    >>> ed_df
           A  B  ...  G                                          H
    0  3.141  1  ...  1  Long text - to be indexed as es type text
    1  3.141  1  ...  2  Long text - to be indexed as es type text
    2  3.141  1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
    [3 rows x 8 columns]
    >>> ed_df.dtypes
    A           float64
    B             int64
    C            object
    D    datetime64[ns]
    E           float64
    F              bool
    G             int64
    H            object
    dtype: object

    See Also
    --------
    eland.eland_to_pandas: Create a pandas.Dataframe from eland.DataFrame
    """
    # Validate up front: previously an unrecognized value was silently
    # ignored and behaved like 'append' (or plain create), hiding typos.
    if es_if_exists not in ("fail", "replace", "append"):
        raise ValueError(
            f"'{es_if_exists}' is not valid for parameter 'es_if_exists', "
            f"must be one of 'fail', 'replace', 'append'"
        )

    if chunksize is None:
        chunksize = DEFAULT_CHUNK_SIZE

    mapping = FieldMappings._generate_es_mappings(pd_df, es_type_overrides)
    es_client = ensure_es_client(es_client)

    # If table exists, check if_exists parameter
    if es_client.indices.exists(index=es_dest_index):
        if es_if_exists == "fail":
            raise ValueError(
                f"Could not create the index [{es_dest_index}] because it "
                f"already exists. "
                f"Change the 'es_if_exists' parameter to "
                f"'append' or 'replace' data."
            )
        elif es_if_exists == "replace":
            es_client.indices.delete(index=es_dest_index)
            es_client.indices.create(index=es_dest_index, body=mapping)
        else:  # es_if_exists == "append" (guaranteed by the validation above)
            dest_mapping = es_client.indices.get_mapping(index=es_dest_index)[
                es_dest_index
            ]
            verify_mapping_compatibility(
                ed_mapping=mapping,
                es_mapping=dest_mapping,
                es_type_overrides=es_type_overrides,
            )
    else:
        es_client.indices.create(index=es_dest_index, body=mapping)

    def action_generator(
        pd_df: pd.DataFrame,
        es_dropna: bool,
        use_pandas_index_for_es_ids: bool,
        es_dest_index: str,
    ) -> Generator[Dict[str, Any], None, None]:
        """Yield one bulk-index action dict per DataFrame row."""
        for row in pd_df.iterrows():
            if es_dropna:
                values = row[1].dropna().to_dict()
            else:
                values = row[1].to_dict()

            if use_pandas_index_for_es_ids:
                # Use index as _id ('doc_id' rather than shadowing builtin 'id')
                doc_id = row[0]
                action = {
                    "_index": es_dest_index,
                    "_source": values,
                    "_id": str(doc_id),
                }
            else:
                action = {"_index": es_dest_index, "_source": values}

            yield action

    # parallel_bulk is lazy generator so use deque to consume them immediately
    # maxlen = 0 because don't need results of parallel_bulk
    deque(
        parallel_bulk(
            client=es_client,
            actions=action_generator(
                pd_df, es_dropna, use_pandas_index_for_es_ids, es_dest_index
            ),
            thread_count=thread_count,
            # Floor division: chunk_size must be an int.
            chunk_size=chunksize // thread_count,
        ),
        maxlen=0,
    )

    if es_refresh:
        es_client.indices.refresh(index=es_dest_index)

    return DataFrame(es_client, es_dest_index)