def test_from_pandas_index(data):
    """Build a dataset from a DataFrame whose index is mapped to its own field.

    ``data`` is indexed as ``row[0]`` / ``row[1]`` (presumably a pytest fixture
    of ``(number, text)`` pairs -- confirm against the fixture definition).
    Element 0 of every row becomes the DataFrame index and element 1 becomes
    the single data column. The test then checks that the resulting dataset
    exposes exactly the two expected field names and that the first item of
    each ``number_field`` entry equals the source index value.
    """
    import pandas as pd

    text_rows = [[row[1]] for row in data]
    index_values = [row[0] for row in data]
    frame = pd.DataFrame(text_rows, index=index_values)

    column_fields = [Field("text_field", keep_raw=True, tokenizer="split")]
    index_field = Field("number_field", tokenizer=None, keep_raw=True)
    dataset = DiskBackedDataset.from_pandas(
        frame,
        column_fields,
        index_field=index_field,
    )

    expected_field_names = {"text_field", "number_field"}
    assert set(dataset.field_dict) == expected_field_names

    # Each entry unpacks into two items; only the first is compared.
    for source_row, entry in zip(data, dataset.number_field):
        raw_value, _ = entry
        assert source_row[0] == raw_value
def test_from_pandas_field_dict(data):
    """Build a dataset from a DataFrame using a column-name -> Field mapping.

    ``data`` is loaded into a DataFrame with columns ``"number"`` and
    ``"text"`` (presumably a pytest fixture of two-element rows -- confirm
    against the fixture definition). The fields are passed as a dict keyed by
    column name, and the test checks that the first item of each ``text``
    entry equals element 1 of the corresponding source row.
    """
    import pandas as pd

    column_names = ["number", "text"]
    frame = pd.DataFrame(data, columns=column_names)

    number_field = Field("number", tokenizer=None)
    text_field = Field("text", keep_raw=True, tokenizer="split")
    fields_by_column = {
        "number": number_field,
        "text": text_field,
    }

    dataset = DiskBackedDataset.from_pandas(frame, fields_by_column)

    # Each entry unpacks into two items; only the first is compared.
    for source_row, entry in zip(data, dataset.text):
        raw_value, _ = entry
        assert source_row[1] == raw_value