def test_numeric_columns(self):
    """OneHotHashVectorizer must accept numeric (float32) input columns,
    whether the columns are given as a rename mapping or as a plain list.

    Both calls should complete without raising; no return value is checked.
    """
    data_path = get_dataset('infert').as_filepath()
    stream = FileDataStream.read_csv(
        data_path, sep=',', numeric_dtype=np.float32)

    # Dict form: each source column is hashed into a renamed output column.
    hasher = OneHotHashVectorizer(
        columns={'edu': 'education', 'in': 'induced', 'sp': 'spontaneous'},
        number_of_bits=2)
    hasher.fit_transform(stream)

    # List form: the named columns are hashed in place, keeping their names.
    hasher = OneHotHashVectorizer(
        columns=['education', 'induced', 'spontaneous'],
        number_of_bits=2)
    hasher.fit_transform(stream)
True, False, True, False, True, False, True ])) test_reviews = pandas.DataFrame(data=dict(review=[ "This is great", "I hate it", "Love it", "Really like it", "I hate it", "I like it a lot", "I love it", "I do like it", "I really hate it", "I love it" ])) # OneHotHashVectorizer transform: the entire string is treated as a category. # if output column name is same as input column, original input column values # are replaced. number_of_bits=6 will hash into 2^6 -1 dimensions y = train_reviews['like'] X = train_reviews.loc[:, train_reviews.columns != 'like'] cat = OneHotHashVectorizer(number_of_bits=6) << 'review' X = cat.fit_transform(X) # view the transformed numerical values and column names print(X) mymodel = LogisticRegressionBinaryClassifier().fit(X, y) X_test = cat.transform(test_reviews) scores = mymodel.predict(cat.transform(test_reviews)) # view the scores print(scores)
###############################################################################
# OneHotHashVectorizer
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer

# Data input (as a FileDataStream).
csv_path = get_dataset('infert').as_filepath()

# Read 'spontaneous' as a string: OneHotHashVectorizer errors on numeric input.
stream = FileDataStream.read_csv(csv_path, sep=',', dtype={'spontaneous': str})
print(stream.head())
#    age  case education  induced  parity ... row_num  spontaneous ...
# 0   26     1    0-5yrs        1       6 ...       1            2 ...
# 1   42     1    0-5yrs        1       1 ...       2            0 ...
# 2   39     1    0-5yrs        2       6 ...       3            0 ...
# 3   34     1    0-5yrs        2       4 ...       4            0 ...
# 4   35     1   6-11yrs        1       3 ...       5            1 ...

# Hash-encode two columns into renamed output columns 'edu' and 'sp'.
hasher = OneHotHashVectorizer(columns={'edu': 'education',
                                       'sp': 'spontaneous'})

# Fit and transform in one step.
encoded = hasher.fit_transform(stream)
print(encoded.head())
#    age  case  edu.0  edu.1003 ...  sp.995 ...  spontaneous  stratum
# 0   26     1    0.0       0.0 ...     0.0 ...          2.0      1.0
# 1   42     1    0.0       0.0 ...     0.0 ...          0.0      2.0
# 2   39     1    0.0       0.0 ...     0.0 ...          0.0      3.0