示例#1
0
    def test_NGramFeaturizer_glove(self):
        """Grid-search the linear learner's iteration count on GloVe features.

        A seven-row sentiment frame goes through an n-gram featurizer, a
        'GloVe50D' word embedding and a fast linear binary classifier. The
        grid covers ``lr__maximum_number_of_iterations`` and the test expects
        the value 100 to be selected.
        """
        # Seed numpy before anything else is built or fitted.
        np.random.seed(0)

        reviews = [
            'I like this movie', 'I don\'t like this', 'It is nice',
            'I like this movie', 'I don\'t like this', 'It is nice',
            'So boring'
        ]
        sentiments = ['pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg']
        data = pd.DataFrame({'review': reviews, 'sentiment': sentiments})

        featurizer = NGramFeaturizer(
            word_feature_extractor=Ngram(),
            output_tokens_column_name='review_TransformedText',
            columns='review')
        embedding = WordEmbedding(columns='review_TransformedText',
                                  model_kind='GloVe50D')
        learner = FastLinearBinaryClassifier(
            feature=['review', 'review_TransformedText'],
            number_of_threads=1,
            shuffle=False)
        # The embedding is passed as a bare transform, without a step name,
        # between the two named steps.
        pipeline = Pipeline([('ng', featurizer), embedding, ('lr', learner)])

        candidates = {'lr__maximum_number_of_iterations': [1, 100, 20]}
        grid = GridSearchCV(pipeline, candidates)

        # Multiplying the boolean mask by 1 turns it into 0/1 labels.
        labels = 1 * (data['sentiment'] == 'pos')
        grid.fit(data['review'], labels)
        assert grid.best_params_['lr__maximum_number_of_iterations'] == 100
示例#2
0
    def test_word_embedding_example2(self):
        """Embed n-gram tokens built from two 'infert' text columns.

        The featurizer is created with ``output_tokens=True`` and the
        embedding reads the column 'features_TransformedText'. The test pins
        the shape of the transformed frame and the presence of one embedding
        column.
        """
        infert_path = get_dataset('infert').as_filepath()
        file_schema = ('sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 '
                       'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 '
                       'col=spontaneous:R4:6 header=+')
        data = FileDataStream(infert_path, schema=file_schema)

        tokenizer = NGramFeaturizer(word_feature_extractor=Ngram(),
                                    output_tokens=True,
                                    columns={'features': ['id', 'education']})
        embedder = WordEmbedding(columns='features_TransformedText')
        pipeline = Pipeline([tokenizer, embedder])

        features = pipeline.fit_transform(data)

        expected_shape = (248, 802)
        assert features.shape == expected_shape
        column_names = list(features.columns)
        assert 'features_TransformedText.94' in column_names
示例#3
0
    def test_word_embedding_example_dict_newname(self):
        """Run WordEmbedding with a dict ``columns`` argument on 'infert'.

        The featurizer writes its tokens to 'features_TransformedText'; the
        embedding is given a dict pairing 'features_TransformedText2' with
        that column. Only the shape of the result is asserted.
        """
        infert_path = get_dataset('infert').as_filepath()
        file_schema = ('sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 '
                       'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 '
                       'col=spontaneous:R4:6 quote+ header=+')
        data = FileDataStream(infert_path, schema=file_schema)

        tokenizer = NGramFeaturizer(
            word_feature_extractor=Ngram(),
            output_tokens_column_name='features_TransformedText',
            columns={'features': ['id', 'education']})

        # NOTE(review): 'features_TransformedText' is the name given above as
        # output_tokens_column_name. Presumably the dict key is the new output
        # name and the value is the source column -- confirm against the
        # WordEmbedding documentation.
        renamed_columns = {
            'features_TransformedText2': 'features_TransformedText'}
        embedder = WordEmbedding(columns=renamed_columns)

        pipeline = Pipeline([tokenizer, embedder])
        features = pipeline.fit_transform(data)

        expected_shape = (248, 409)
        assert features.shape == expected_shape
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.naive_bayes import NaiveBayesClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import train_test_split

# Train and test data both come from the 'wiki_detox_train' data set.
# Sample rows (tab separated):
# Sentiment	SentimentText
# 1	  ==RUDE== Dude, you are rude upload that carl picture back, or else.
# 1	  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIKI THEN!!!
np.random.seed(0)
train_file = get_dataset("wiki_detox_train").as_filepath()
train, label = get_X_y(train_file, label_column='Sentiment', sep='\t')

X_train, X_test, y_train, y_test = train_test_split(train, label)

# featurize the 'SentimentText' column with word n-grams, no normalization
texttransform = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    vector_normalizer='None',
) << 'SentimentText'
nb = NaiveBayesClassifier(feature=['SentimentText'])

# chain the featurizer and the classifier, then train on the training split
ppl = Pipeline([texttransform, nb])
ppl.fit(X_train, y_train)

# keep only the predicted label column of the prediction output
predictions = ppl.predict(X_test)
scores = predictions['PredictedLabel']

# evaluate the model: mean of the element-wise match with the true labels
accuracy = np.mean(y_test == list(scores))
print('Accuracy:', accuracy)
示例#5
0
import pandas
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
from nimbusml.feature_extraction.text.extractor import Ngram

# sample customer reviews, one review per row in the 'review' column
review_texts = [
    "I really did not like the taste of it", "It was surprisingly quite good!",
    "I will never ever ever go to that place again!!",
    "The best ever!! It was amazingly good and super fast",
    "I wish I had gone earlier, it was that great",
    "somewhat dissapointing. I'd probably wont try again",
    "Never visit again... rascals!"
]
customer_reviews = pandas.DataFrame(data=dict(review=review_texts))

# the n-gram step names its token output 'review_TransformedText', which is
# the column the embedding step is bound to with `<<`
tokenizer = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    output_tokens_column_name='review_TransformedText')
embedder = WordEmbedding() << 'review_TransformedText'
pipeline = Pipeline([tokenizer, embedder])
y = pipeline.fit_transform(customer_reviews)

# view a small subset of the review embeddings:
# first five rows, last three columns
embedding_subset = y.iloc[:5, -3:]
print(embedding_subset)
#    review_TransformedText.147  review_TransformedText.148  review_TransformedText.149
# 0                    1.918661                   -0.714531                    3.062141
# 1                    1.891922                   -0.248650                    1.706620
# 2                    1.601611                    0.309785                    3.379576
# 3                    1.970666                    1.477450                    3.110802
# 4                    2.521791                    0.122538                    3.129919
示例#6
0
     ColumnConcatenator() << {
         'concated_columns': [
             'Petal_Length',
             'Sepal_Width',
             'Sepal_Length']},
     LpScaler(columns={'normed_columns': 'concated_columns'})
 ]),
 'MutualInformationSelector': Pipeline([
     ColumnConcatenator(columns={'Features': ['Sepal_Width', 'Sepal_Length', 'Petal_Width']}),
     MutualInformationSelector(
         columns='Features',
         label='Label',
         slots_in_output=2)  # only accept one column
 ]),
 'NaiveBayesClassifier': NaiveBayesClassifier(feature=['Sepal_Width', 'Sepal_Length']),
 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=Ngram(),
                                    char_feature_extractor=Ngram(),
                                    keep_diacritics=True,
                                    columns={ 'features': ['SentimentText']}),
 'OneHotHashVectorizer': OneHotHashVectorizer(columns=['education_str']),
 'OneHotVectorizer': OneHotVectorizer(columns=['education_str']),
 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)': \
     OneVsRestClassifier(AveragedPerceptronBinaryClassifier(),
                         use_probabilities=True,
                         feature=['age',
                                  'education_str.0-5yrs',
                                  'education_str.6-11yrs',
                                  'education_str.12+ yrs'],
                         label='induced'),
 'OneVsRestClassifier(LinearSvmBinaryClassifier)': \
     OneVsRestClassifier(LinearSvmBinaryClassifier(),
示例#7
0
from nimbusml.feature_extraction.text.extractor import Ngram

# load the tab-separated 'wiki_detox_train' file as a FileDataStream
path = get_dataset('wiki_detox_train').as_filepath()
data = FileDataStream.read_csv(path, sep='\t')
preview = data.head()
print(preview)
#   Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# transform usage: the n-gram step names its token output
# 'ngram_TransformedText', and the embedding step reads that column
tokenizer = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    output_tokens_column_name='ngram_TransformedText',
    columns={'ngram': ['SentimentText']})
embedder = WordEmbedding(columns='ngram_TransformedText')
pipeline = Pipeline([tokenizer, embedder])

# fit both steps on the stream and transform it in one call
features = pipeline.fit_transform(data)

# print the first rows of the transformed features
print(features.head())
#   Sentiment  ...       ngram.douchiest  ngram.award.
# 0          1 ...                   0.0           0.0
# 1          1 ...                   0.0           0.0
# 2          1 ...                   0.0           0.0
# 3          1 ...                   0.0           0.0
示例#8
0
    data=dict(
        review=[
            "This is great",
            "I hate it",
            "Love it",
            "Really like it",
            "I hate it",
            "I like it a lot",
            "I love it",
            "I do like it",
            "I really hate it",
            "I love it"]))

# split the training frame into the 'like' label and the remaining columns
y = train_reviews['like']
X = train_reviews.loc[:, train_reviews.columns != 'like']

# fit the word n-gram featurizer on the 'review' column and replace X with
# its transformed output
ngram = NGramFeaturizer(word_feature_extractor=Ngram()) << 'review'
X = ngram.fit_transform(X)

# view the transformed numerical values and column names
# print(X.head())

mymodel = LogisticRegressionBinaryClassifier().fit(X, y)

# featurize the test reviews once with the already-fitted transform and score
# that result, rather than transforming the same frame a second time
X_test = ngram.transform(test_reviews)

scores = mymodel.predict(X_test)

# view the scores
# print(scores.head())
示例#9
0
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.naive_bayes import NaiveBayesClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import train_test_split

# The 'wiki_detox_train' data set supplies both the train and the test split.
# Sample rows (tab separated):
# Sentiment	SentimentText
# 1	  ==RUDE== Dude, you are rude upload that carl picture back, or else.
# 1	  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIKI THEN!!!
np.random.seed(0)
train_file = get_dataset("wiki_detox_train").as_filepath()
train, label = get_X_y(train_file, label_column='Sentiment', sep='\t')

X_train, X_test, y_train, y_test = train_test_split(train, label)

# map text reviews to vector space (word n-grams, normalizer set to 'None')
word_ngrams = Ngram()
texttransform = NGramFeaturizer(word_feature_extractor=word_ngrams,
                                vector_normalizer='None') << 'SentimentText'
nb = NaiveBayesClassifier(feature=['SentimentText'])

steps = [texttransform, nb]
ppl = Pipeline(steps)
ppl.fit(X_train, y_train)

# evaluate the model on the held-out split; with output_scores=True the call
# hands back both the metrics and the scores
metrics, scores = ppl.test(X_test, y_test, output_scores=True)

print(metrics)
示例#10
0
###############################################################################
# WordEmbedding: pre-trained transform to generate word embeddings
import pandas
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import WordEmbedding
from nimbusml.feature_extraction.text.ngramfeaturizer import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram

# build a one-column frame of customer reviews
review_texts = [
    "I really did not like the taste of it", "It was surprisingly quite good!",
    "I will never ever ever go to that place again!!",
    "The best ever!! It was amazingly good and super fast",
    "I wish I had gone earlier, it was that great",
    "somewhat dissapointing. I'd probably wont try again",
    "Never visit again... rascals!"
]
customer_reviews = pandas.DataFrame(data=dict(review=review_texts))

# the featurizer is asked to output its tokens; the embedding step is bound
# to the 'review_TransformedText' column with `<<`
tokenizer = NGramFeaturizer(word_feature_extractor=Ngram(), output_tokens=True)
embedder = WordEmbedding() << 'review_TransformedText'
pipeline = Pipeline([tokenizer, embedder])
y = pipeline.fit_transform(customer_reviews)

# view the review embeddings
print(y)
示例#11
0
###############################################################################
# LightLda: cluster topics
import pandas
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import LightLda, NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram

# build a one-column frame of short documents
documents = [
    "animals birds cats dogs fish horse", "horse birds house fish duck cats",
    "car truck driver bus pickup", "car truck driver bus pickup horse ",
    "car truck", "bus pickup", "space galaxy universe radiation",
    "radiation galaxy universe duck"
]
topics = pandas.DataFrame(data=dict(review=documents))

# there are three main topics in our data, so num_topic is set to 3;
# the printed LightLDA vectors show whether related documents look similar
featurizer = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    vector_normalizer='None',
) << 'review'
lda = LightLda(num_topic=3)
pipeline = Pipeline([featurizer, lda])
y = pipeline.fit_transform(topics)

# view the LDA topic vectors
print(y)
    data=dict(review=[
        "This is great", "I hate it", "Love it", "Do not like it",
        "Really like it", "I hate it", "I like it a lot", "I kind of hate it",
        "I do like it", "I really hate it", "It is very good",
        "I hate it a bunch", "I love it a bunch", "I hate it",
        "I like it very much", "I hate it very much.", "I really do love it",
        "I really do hate it", "Love it!", "Hate it!", "I love it",
        "I hate it", "I love it", "I hate it", "I love it"
    ],
              like=[
                  True, False, True, False, True, False, True, False, True,
                  False, True, False, True, False, True, False, True, False,
                  True, False, True, False, True, False, True
              ]))

# everything except the 'like' column is a feature; 'like' is the label
feature_mask = train_reviews.columns != 'like'
X = train_reviews.loc[:, feature_mask]
y = train_reviews['like']

# pipeline of transforms: n-gram featurization followed by mutual-information
# feature selection with slots_in_output=2
transform_1 = NGramFeaturizer(word_feature_extractor=Ngram())
transform_2 = MutualInformationSelector(slots_in_output=2)
pipeline = Pipeline([transform_1, transform_2])
selected = pipeline.fit_transform(X, y)
print(selected)

# Scikit compatibility (Compose transforms inside Scikit Pipeline).
# In this scenario, we do not provide {input, output} arguments.
# Fresh transform instances are created and passed as named steps.
transform_1 = NGramFeaturizer(word_feature_extractor=Ngram())
transform_2 = MutualInformationSelector(slots_in_output=2)
named_steps = [('text', transform_1), ('featureselect', transform_2)]
pipe = Pipeline(named_steps)
selected_named = pipe.fit_transform(X, y)
print(selected_named)