def test_word_embedding(self):
    """Export an NGramFeaturizer + WordEmbedding pipeline to DOT and
    verify the featurizer node lists both of its output columns."""
    review_texts = [
        "This is great", "I hate it", "Love it", "Do not like it",
        "Really like it", "I hate it", "I like it a lot",
        "I kind of hate it", "I do like it", "I really hate it",
        "It is very good", "I hate it a bunch", "I love it a bunch",
        "I hate it", "I like it very much", "I hate it very much.",
        "I really do love it", "I really do hate it", "Love it!",
        "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
        "I love it",
    ]
    liked = [
        True, False, True, False, True, False, True, False, True, False,
        True, False, True, False, True, False, True, False, True, False,
        True, False, True, False, True,
    ]
    ds_train = pandas.DataFrame(
        data=dict(description=review_texts, like=liked))

    # Tokenize the description column, then embed the emitted tokens.
    ng = NGramFeaturizer(columns=['description'], output_tokens=True)
    we = WordEmbedding(columns='description_TransformedText',
                       model_kind='Sswe')
    model = Pipeline([ng, we])

    dot_vis = dot_export_pipeline(model, ds_train)
    # The featurizer node must expose both the ngram output and the
    # auxiliary token column consumed by WordEmbedding.
    expected_node = 'ch1[label="<f0> description|<f1> ' \
                    'description_TransformedText"'
    assert expected_node in dot_vis
def test_NGramFeaturizer_glove(self):
    """Grid-search the linear learner's iteration budget over a tiny
    sentiment corpus and confirm the expected best value is selected.

    NOTE(review): an earlier copy of this comment said the search was
    over number_of_trees; the grid actually tunes
    lr__maximum_number_of_iterations.
    """
    np.random.seed(0)
    reviews = [
        'I like this movie', 'I don\'t like this', 'It is nice',
        'I like this movie', 'I don\'t like this', 'It is nice',
        'So boring',
    ]
    data = pd.DataFrame({
        'review': reviews,
        'sentiment': ['pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg'],
    })

    featurizer = NGramFeaturizer(
        word_feature_extractor=Ngram(),
        output_tokens_column_name='review_TransformedText',
        columns='review')
    embedding = WordEmbedding(columns='review_TransformedText',
                              model_kind='GloVe50D')
    learner = FastLinearBinaryClassifier(
        feature=['review', 'review_TransformedText'],
        number_of_threads=1,
        shuffle=False)
    pipeline = Pipeline([('ng', featurizer), embedding, ('lr', learner)])

    # 100 iterations should beat both the under-trained (1) and the
    # intermediate (20) settings on this data.
    param_grid = dict(lr__maximum_number_of_iterations=[1, 100, 20])
    grid = GridSearchCV(pipeline, param_grid)
    grid.fit(data['review'], 1 * (data['sentiment'] == 'pos'))
    assert grid.best_params_['lr__maximum_number_of_iterations'] == 100
def test_ssweembedding(self):
    """Concatenate three word columns and run the default (Sswe)
    WordEmbedding; the sum of the embedded features is a known value."""
    wordvectors = pd.DataFrame(data=dict(
        w1=["like", "hate", "okay"],
        w2=["great", "horrible", "lukewarm"],
        w3=["awesome", "worst", "boring"]))
    expected_total = -97.6836

    concat = ColumnConcatenator() << {'features': ['w1', 'w2', 'w3']}
    sswe = WordEmbedding() << 'features'
    pipeline = Pipeline([concat, sswe])

    transformed = pipeline.fit_transform(wordvectors)
    # Keep only the embedding output columns before summing.
    embedded_cols = [c for c in transformed.columns if 'features' in c]
    transformed = transformed[embedded_cols]
    assert_almost_equal(
        transformed.sum().sum(), expected_total, decimal=4,
        err_msg="Sum should be %s" % expected_total)
def test_word_embedding_example2(self):
    """Embed tokens of a concatenated 'features' column built from two
    text columns of the infert dataset and check the output shape.

    NOTE(review): this test uses the legacy output_tokens=True spelling
    while sibling tests use output_tokens_column_name — presumably both
    are accepted by this nimbusml version; confirm before unifying.
    """
    path = get_dataset('infert').as_filepath()
    file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                  'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                  'col=spontaneous:R4:6 header=+'
    data = FileDataStream(path, schema=file_schema)

    featurizer = NGramFeaturizer(
        word_feature_extractor=Ngram(),
        output_tokens=True,
        columns={'features': ['id', 'education']})
    embedding = WordEmbedding(columns='features_TransformedText')

    features = Pipeline([featurizer, embedding]).fit_transform(data)
    assert features.shape == (248, 802)
    assert 'features_TransformedText.94' in list(features.columns)
def test_word_embedding_example_dict_newname(self):
    """A dict-valued columns argument makes WordEmbedding write its
    output into a new column instead of replacing its input."""
    path = get_dataset('infert').as_filepath()
    file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                  'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                  'col=spontaneous:R4:6 quote+ header=+'
    data = FileDataStream(path, schema=file_schema)

    pipeline = Pipeline([
        NGramFeaturizer(
            word_feature_extractor=Ngram(),
            output_tokens_column_name='features_TransformedText',
            columns={'features': ['id', 'education']}),
        # 'features_TransformedText' is the token column emitted by the
        # featurizer above; embed it into a freshly named column.
        WordEmbedding(columns={
            'features_TransformedText2': 'features_TransformedText'}),
    ])

    features = pipeline.fit_transform(data)
    assert features.shape == (248, 409)
import pandas

from nimbusml import Pipeline
from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
from nimbusml.feature_extraction.text.extractor import Ngram

# create the data
customer_reviews = pandas.DataFrame(data=dict(review=[
    "I really did not like the taste of it",
    "It was surprisingly quite good!",
    "I will never ever ever go to that place again!!",
    "The best ever!! It was amazingly good and super fast",
    "I wish I had gone earlier, it was that great",
    "somewhat dissapointing. I'd probably wont try again",
    "Never visit again... rascals!",
]))

# Tokenize each review, then embed the resulting token column.
pipeline = Pipeline([
    NGramFeaturizer(word_feature_extractor=Ngram(),
                    output_tokens_column_name='review_TransformedText'),
    WordEmbedding() << 'review_TransformedText',
])
y = pipeline.fit_transform(customer_reviews)

# view a small subset of the review embeddings
print(y.iloc[:5, -3:])
#    review_TransformedText.147  review_TransformedText.148  review_TransformedText.149
# 0                    1.918661                   -0.714531                    3.062141
# 1                    1.891922                   -0.248650                    1.706620
# 2                    1.601611                    0.309785                    3.379576
# 3                    1.970666                    1.477450                    3.110802
# 4                    2.521791                    0.122538                    3.129919
# data input (as a FileDataStream)
path = get_dataset('wiki_detox_train').as_filepath()
data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#    Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# transform usage: tokenize the text into an 'ngram' column, then embed
# the token column the featurizer emits alongside it.
pipeline = Pipeline([
    NGramFeaturizer(word_feature_extractor=Ngram(),
                    output_tokens=True,
                    columns={'ngram': ['SentimentText']}),
    WordEmbedding(columns='ngram_TransformedText'),
])

# fit and transform
features = pipeline.fit_transform(data)

# print features
print(features.head())
#    Sentiment  ...  ngram.douchiest  ngram.award.
# 0          1  ...              0.0           0.0
# 1          1  ...              0.0           0.0
# 2          1  ...              0.0           0.0
# 3          1  ...              0.0           0.0
# 4          1  ...              0.0           0.0
# 'path' and 'file_schema' are defined earlier in this file.
data = FileDataStream(path, schema=file_schema)
print(data.head())
#    Sentiment                                      SentimentText
# 0        1.0  ==RUDE== Dude, you are rude upload that carl p...
# 1        1.0  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2        1.0  Stop trolling, zapatancas, calling me a liar m...
# 3        1.0  ==You're cool==  You seem like a really cool g...
# 4        1.0  ::::: Why are you threatening me? I'm not bein...

# After using Character Tokenizer, it will convert the vector of Char to
# Key type. Use FromKey to retrieve the data from Key first, then send
# into WordEmbedding.
pipe = Pipeline([
    CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
    FromKey(columns={'SentimentText_FromKey': 'SentimentText_Transform'}),
    WordEmbedding(model_kind='GloVe50D',
                  columns={'Feature': 'SentimentText_FromKey'}),
    # Drop the intermediate columns; keep only the inputs and the result.
    ColumnSelector(columns=['Sentiment', 'SentimentText', 'Feature']),
])

print(pipe.fit_transform(data).head())
#    Sentiment  ...  Feature.149
# 0        1.0  ...      2.67440
# 1        1.0  ...      0.78858
# 2        1.0  ...      2.67440
# 3        1.0  ...      2.67440
# 4        1.0  ...      2.67440
# [5 rows x 152 columns]