def test_word_embedding(self):
    """Export an NGramFeaturizer + WordEmbedding pipeline to DOT and
    verify the featurizer node lists both of its output columns."""
    review_texts = [
        "This is great", "I hate it", "Love it", "Do not like it",
        "Really like it", "I hate it", "I like it a lot",
        "I kind of hate it", "I do like it", "I really hate it",
        "It is very good", "I hate it a bunch", "I love it a bunch",
        "I hate it", "I like it very much", "I hate it very much.",
        "I really do love it", "I really do hate it", "Love it!",
        "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
        "I love it",
    ]
    liked = [
        True, False, True, False, True, False, True, False, True, False,
        True, False, True, False, True, False, True, False, True, False,
        True, False, True, False, True,
    ]
    ds_train = pandas.DataFrame(
        data=dict(description=review_texts, like=liked))

    # Tokenize the description column, then embed the emitted tokens.
    ng = NGramFeaturizer(columns=['description'], output_tokens=True)
    we = WordEmbedding(columns='description_TransformedText',
                       model_kind='Sswe')
    model = Pipeline([ng, we])

    dot_vis = dot_export_pipeline(model, ds_train)
    # The featurizer node must expose both the ngram output and the
    # auxiliary token column consumed by WordEmbedding.
    expected_node = 'ch1[label="<f0> description|<f1> ' \
                    'description_TransformedText"'
    assert expected_node in dot_vis
def test_NGramFeaturizer_glove(self):
    """Grid-search the linear learner's iteration budget over a tiny
    sentiment corpus and confirm the expected best value is selected.

    NOTE(review): an earlier copy of this comment said the search was
    over number_of_trees; the grid actually tunes
    lr__maximum_number_of_iterations.
    """
    np.random.seed(0)
    reviews = [
        'I like this movie', 'I don\'t like this', 'It is nice',
        'I like this movie', 'I don\'t like this', 'It is nice',
        'So boring',
    ]
    data = pd.DataFrame({
        'review': reviews,
        'sentiment': ['pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg'],
    })

    featurizer = NGramFeaturizer(
        word_feature_extractor=Ngram(),
        output_tokens_column_name='review_TransformedText',
        columns='review')
    embedding = WordEmbedding(columns='review_TransformedText',
                              model_kind='GloVe50D')
    learner = FastLinearBinaryClassifier(
        feature=['review', 'review_TransformedText'],
        number_of_threads=1,
        shuffle=False)
    pipeline = Pipeline([('ng', featurizer), embedding, ('lr', learner)])

    # 100 iterations should beat both the under-trained (1) and the
    # intermediate (20) settings on this data.
    param_grid = dict(lr__maximum_number_of_iterations=[1, 100, 20])
    grid = GridSearchCV(pipeline, param_grid)
    grid.fit(data['review'], 1 * (data['sentiment'] == 'pos'))
    assert grid.best_params_['lr__maximum_number_of_iterations'] == 100
def test_ssweembedding(self):
    """Concatenate three word columns and run the default (Sswe)
    WordEmbedding; the sum of the embedded features is a known value."""
    wordvectors = pd.DataFrame(data=dict(
        w1=["like", "hate", "okay"],
        w2=["great", "horrible", "lukewarm"],
        w3=["awesome", "worst", "boring"]))
    expected_total = -97.6836

    concat = ColumnConcatenator() << {'features': ['w1', 'w2', 'w3']}
    sswe = WordEmbedding() << 'features'
    pipeline = Pipeline([concat, sswe])

    transformed = pipeline.fit_transform(wordvectors)
    # Keep only the embedding output columns before summing.
    embedded_cols = [c for c in transformed.columns if 'features' in c]
    transformed = transformed[embedded_cols]
    assert_almost_equal(
        transformed.sum().sum(), expected_total, decimal=4,
        err_msg="Sum should be %s" % expected_total)
def test_word_embedding_example2(self):
    """Embed tokens of a concatenated 'features' column built from two
    text columns of the infert dataset and check the output shape.

    NOTE(review): this test uses the legacy output_tokens=True spelling
    while sibling tests use output_tokens_column_name — presumably both
    are accepted by this nimbusml version; confirm before unifying.
    """
    path = get_dataset('infert').as_filepath()
    file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                  'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                  'col=spontaneous:R4:6 header=+'
    data = FileDataStream(path, schema=file_schema)

    featurizer = NGramFeaturizer(
        word_feature_extractor=Ngram(),
        output_tokens=True,
        columns={'features': ['id', 'education']})
    embedding = WordEmbedding(columns='features_TransformedText')

    features = Pipeline([featurizer, embedding]).fit_transform(data)
    assert features.shape == (248, 802)
    assert 'features_TransformedText.94' in list(features.columns)
def test_word_embedding_example_dict_newname(self):
    """A dict-valued columns argument makes WordEmbedding write its
    output into a new column instead of replacing its input."""
    path = get_dataset('infert').as_filepath()
    file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                  'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                  'col=spontaneous:R4:6 quote+ header=+'
    data = FileDataStream(path, schema=file_schema)

    pipeline = Pipeline([
        NGramFeaturizer(
            word_feature_extractor=Ngram(),
            output_tokens_column_name='features_TransformedText',
            columns={'features': ['id', 'education']}),
        # 'features_TransformedText' is the token column emitted by the
        # featurizer above; embed it into a freshly named column.
        WordEmbedding(columns={
            'features_TransformedText2': 'features_TransformedText'}),
    ])

    features = pipeline.fit_transform(data)
    assert features.shape == (248, 409)
import pandas

from nimbusml import Pipeline
from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
from nimbusml.feature_extraction.text.extractor import Ngram

# create the data
customer_reviews = pandas.DataFrame(data=dict(review=[
    "I really did not like the taste of it",
    "It was surprisingly quite good!",
    "I will never ever ever go to that place again!!",
    "The best ever!! It was amazingly good and super fast",
    "I wish I had gone earlier, it was that great",
    "somewhat dissapointing. I'd probably wont try again",
    "Never visit again... rascals!",
]))

# Tokenize each review, then embed the resulting token column.
pipeline = Pipeline([
    NGramFeaturizer(word_feature_extractor=Ngram(),
                    output_tokens_column_name='review_TransformedText'),
    WordEmbedding() << 'review_TransformedText',
])
y = pipeline.fit_transform(customer_reviews)

# view a small subset of the review embeddings
print(y.iloc[:5, -3:])
#    review_TransformedText.147  review_TransformedText.148  review_TransformedText.149
# 0                    1.918661                   -0.714531                    3.062141
# 1                    1.891922                   -0.248650                    1.706620
# 2                    1.601611                    0.309785                    3.379576
# 3                    1.970666                    1.477450                    3.110802
# 4                    2.521791                    0.122538                    3.129919
# data input (as a FileDataStream)
path = get_dataset('wiki_detox_train').as_filepath()
data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#    Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# transform usage: tokenize the text into an 'ngram' column, then embed
# the token column the featurizer emits alongside it.
pipeline = Pipeline([
    NGramFeaturizer(word_feature_extractor=Ngram(),
                    output_tokens=True,
                    columns={'ngram': ['SentimentText']}),
    WordEmbedding(columns='ngram_TransformedText'),
])

# fit and transform
features = pipeline.fit_transform(data)

# print features
print(features.head())
#    Sentiment  ...  ngram.douchiest  ngram.award.
# 0          1  ...              0.0           0.0
# 1          1  ...              0.0           0.0
# 2          1  ...              0.0           0.0
# 3          1  ...              0.0           0.0
# 4          1  ...              0.0           0.0
# 'path' and 'file_schema' are defined earlier in this file.
data = FileDataStream(path, schema=file_schema)
print(data.head())
#    Sentiment                                      SentimentText
# 0        1.0  ==RUDE== Dude, you are rude upload that carl p...
# 1        1.0  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2        1.0  Stop trolling, zapatancas, calling me a liar m...
# 3        1.0  ==You're cool==  You seem like a really cool g...
# 4        1.0  ::::: Why are you threatening me? I'm not bein...

# After using Character Tokenizer, it will convert the vector of Char to
# Key type. Use FromKey to retrieve the data from Key first, then send
# into WordEmbedding.
pipe = Pipeline([
    CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
    FromKey(columns={'SentimentText_FromKey': 'SentimentText_Transform'}),
    WordEmbedding(model_kind='GloVe50D',
                  columns={'Feature': 'SentimentText_FromKey'}),
    # Drop the intermediate columns; keep only the inputs and the result.
    ColumnSelector(columns=['Sentiment', 'SentimentText', 'Feature']),
])

print(pipe.fit_transform(data).head())
#    Sentiment  ...  Feature.149
# 0        1.0  ...      2.67440
# 1        1.0  ...      0.78858
# 2        1.0  ...      2.67440
# 3        1.0  ...      2.67440
# 4        1.0  ...      2.67440
# [5 rows x 152 columns]