def test_check_estimator_fromkey(self):
    text_df = pandas.DataFrame(
        data=dict(
            text=[
                "cat", "dog", "fish", "orange",
                "cat orange", "dog", "fish", "spider"],
            num=[1, 2, 3, 4, 5, 6, 7, 8]))

    tokey = ToKey() << ['text']
    data_idv = tokey.fit_transform(text_df)
    assert data_idv is not None
    assert len(data_idv) > 0
    assert str(sorted([str(dt) for dt in data_idv.dtypes])) == \
        "['category', 'int64']"

    fromkey = FromKey() << ['text']
    data = fromkey.fit_transform(data_idv)
    assert str(list(data_idv['text'])) == str(list(data['text']))

    t = numpy.unique(data_idv['text'].cat.codes)
    assert len(t) == 6
    assert list(data_idv['text'].cat.categories) == [
        "cat", "dog", "fish", "orange", "cat orange", "spider"]
def test_check_estimator_fromkey_categories(self):
    text_df = pandas.DataFrame(
        data=dict(
            text=[
                "cat", "dog", "fish", "orange",
                "cat orange", "dog", "fish", "spider"]),
        dtype="category")

    tokey = ToKey() << ['text']
    data_idv = tokey.fit_transform(text_df)
    assert data_idv is not None
    assert len(data_idv) > 0
    assert data_idv['text'].dtype == 'category'
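# Note: the two test methods above assume module-level imports and a
# unittest.TestCase class that this excerpt does not show. A minimal sketch of
# that scaffolding follows; the class name is an assumption, not taken from
# the original test file.

import unittest

import numpy
import pandas
from nimbusml.preprocessing import FromKey, ToKey


class TestFromKey(unittest.TestCase):
    # the test_check_estimator_fromkey* methods above would be defined here
    pass


if __name__ == '__main__':
    unittest.main()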
###############################################################################
# FromKey
import pandas
from nimbusml.preprocessing import FromKey, ToKey
from pandas import Categorical

# Create the data
categorical_df = pandas.DataFrame(data=dict(
    key=Categorical.from_codes([0, 1, 2, 1, 2, 0], categories=['a', 'b', 'c']),
    text=['b', 'c', 'a', 'b', 'a', 'c']))

# FromKey maps the key (category) codes in 'key' back to their category values
fromkey = FromKey(columns='key')
y = fromkey.fit_transform(categorical_df)
print(y)

# Round trip: ToKey followed by FromKey recovers the original 'text' values
tokey = ToKey(columns='text')
y = tokey.fit_transform(categorical_df)
y2 = fromkey.clone().fit_transform(y)
print(y2['text'] == categorical_df['text'])
import numpy
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing import ToKey

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32,
                               names={0: 'id'})
print(data.head())
#     age  case education   id  induced  parity  pooled.stratum  spontaneous  ...
# 0  26.0   1.0    0-5yrs  1.0      1.0     6.0             3.0          2.0  ...
# 1  42.0   1.0    0-5yrs  2.0      1.0     1.0             1.0          0.0  ...
# 2  39.0   1.0    0-5yrs  3.0      2.0     6.0             4.0          0.0  ...
# 3  34.0   1.0    0-5yrs  4.0      2.0     4.0             2.0          0.0  ...
# 4  35.0   1.0   6-11yrs  5.0      1.0     3.0            32.0          1.0  ...

# transform usage
xf = ToKey(columns={'id_1': 'id', 'edu_1': 'education'})

# fit and transform
features = xf.fit_transform(data)
print(features.head())
#     age  case   edu_1 education   id  id_1  induced  parity  ...
# 0  26.0   1.0  0-5yrs    0-5yrs  1.0     0      1.0     6.0  ...
# 1  42.0   1.0  0-5yrs    0-5yrs  2.0     1      1.0     1.0  ...
# 2  39.0   1.0  0-5yrs    0-5yrs  3.0     2      2.0     6.0  ...
# 3  34.0   1.0  0-5yrs    0-5yrs  4.0     3      2.0     4.0  ...
# 4  35.0   1.0  6-11yrs   6-11yrs  5.0     4      1.0     3.0  ...
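# A follow-up sketch, not part of the original example: ToKey steps can be
# composed with other transforms in a nimbusml Pipeline. The output column
# names 'edu_key' and 'edu_back' are hypothetical, chosen only for
# illustration; the sketch re-uses the 'infert' FileDataStream loaded above.
from nimbusml import Pipeline
from nimbusml.preprocessing import FromKey

pipe = Pipeline([
    # map the 'education' strings to key (category) codes
    ToKey(columns={'edu_key': 'education'}),
    # map the key codes back to the original string values
    FromKey(columns={'edu_back': 'edu_key'}),
])
out = pipe.fit_transform(data)
print(out.head())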
###############################################################################
# ToKey
import pandas
from nimbusml.preprocessing import ToKey

# Create the data
text_df = pandas.DataFrame(
    data=dict(
        text=[
            "cat", "dog", "fish", "orange",
            "cat orange", "dog", "fish", "spider"]))

tokey = ToKey() << 'text'
y = tokey.fit_transform(text_df)
print(y)
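# A short follow-up sketch, based on the assertions in the tests above: after
# ToKey, the 'text' column is a pandas Categorical, so the learned vocabulary
# and the per-row integer codes can be inspected directly.
print(y['text'].dtype)                   # expected: category
print(list(y['text'].cat.categories))    # the learned vocabulary
print(list(y['text'].cat.codes))         # per-row key codes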