Example #1
from __future__ import print_function
import json
from deepmoji.sentence_tokenizer import SentenceTokenizer
from deepmoji.model_def import deepmoji_feature_encoding
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

TEST_SENTENCES = [
    u'I love mom\'s cooking', u'I love how you never reply back..',
    u'I love cruising with my homies', u'I love messing with yo mind!!',
    u'I love you and now you\'re just gone..', u'This is shit',
    u'This is the shit'
]

maxlen = 30
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH)
model.summary()

print('Encoding texts..')
encoding = model.predict(tokenized)

print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
print(encoding[0, :5])

# Now you could visualize the encodings to see differences,
# run a logistic regression classifier on top,
# or basically anything you'd like to do.
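
# A minimal sketch of the logistic-regression idea mentioned above,
# assuming hypothetical binary sentiment labels for the seven test
# sentences (scikit-learn is not part of the original snippet):
from sklearn.linear_model import LogisticRegression

labels = [1, 0, 1, 1, 0, 0, 1]  # hypothetical labels, one per sentence
clf = LogisticRegression().fit(encoding, labels)
print(clf.predict(encoding[:2]))  # predictions for the first two sentences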
Example #2
from __future__ import print_function
from keras.models import load_model, Model
from keras.utils import CustomObjectScope
from deepmoji.sentence_tokenizer import SentenceTokenizer
from deepmoji.attlayer import AttentionWeightedAverage
import codecs
import json
import sys
import numpy as np
TEST_SENTENCES = codecs.open(sys.argv[1], 'r', encoding='utf-8').readlines()
print('Loading model: ', sys.argv[2])
with CustomObjectScope({'AttentionWeightedAverage': AttentionWeightedAverage}):
    model = load_model(sys.argv[2])
model.summary()

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

st = SentenceTokenizer(vocab, 20)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

print('Reverse tokenizing.', file=sys.stdout)
st_id2tok = [None] * len(st.vocabulary)
for w in st.vocabulary:
    st_id2tok[st.vocabulary[w]] = w
tokenized_sents = [[st_id2tok[i] for i in row] for row in tokenized]

# Build a second model that exposes the 'attlayer' output. The layer
# outputs both the weighted average and the attention weights, which the
# unpacking below relies on.
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer('attlayer').output)

print('Running predictions.')
prob = model.predict(tokenized)
_, att_weights = intermediate_layer_model.predict(tokenized)
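
# A sketch (not in the original) of pairing each token with its attention
# weight via the reverse-tokenized sentences built above; assumes
# att_weights has shape (n_sentences, maxlen):
for sent, weights in zip(tokenized_sents, att_weights):
    ranked = sorted(zip(sent, weights), key=lambda t: t[1], reverse=True)
    print(ranked[:3])  # the three most-attended tokens of the sentence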
Example #3
from __future__ import print_function
import json
import pickle
from deepmoji.sentence_tokenizer import SentenceTokenizer, coverage

# DATASET_PATHS is assumed to be defined elsewhere as a list of paths to
# pickled benchmark datasets; the vocabulary path mirrors Example #4.
with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

results = []
for p in DATASET_PATHS:
    coverage_result = [p]
    print('Calculating coverage for {}'.format(p))
    with open(p, 'rb') as f:
        s = pickle.load(f)

    # Decode data
    try:
        s['texts'] = [unicode(x) for x in s['texts']]
    except UnicodeDecodeError:
        s['texts'] = [x.decode('utf-8') for x in s['texts']]

    # Own
    st = SentenceTokenizer({}, 30)
    tests, dicts, _ = st.split_train_val_test(
        s['texts'],
        s['info'], [s['train_ind'], s['val_ind'], s['test_ind']],
        extend_with=10000)
    coverage_result.append(coverage(tests[2]))

    # Last
    st = SentenceTokenizer(vocab, 30)
    tests, dicts, _ = st.split_train_val_test(
        s['texts'],
        s['info'], [s['train_ind'], s['val_ind'], s['test_ind']],
        extend_with=0)
    coverage_result.append(coverage(tests[2]))

    # Full
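    # The "Full" branch is truncated in the source and left as-is.
    # A sketch (an assumption, mirroring the pattern above) of collecting
    # the per-dataset rows computed so far:
    results.append(coverage_result)

for r in results:
    print(r)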
Example #4
"""
Take a given list of sentences and turn it into a numpy array, where each
number corresponds to a word. Padding is used (number 0) to ensure fixed length
of sentences.
"""

from __future__ import print_function
import thesis_helper
import json
from deepmoji.sentence_tokenizer import SentenceTokenizer

with open('../model/vocabulary.json', 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, 30)
test_sentences = [
    u'\u2014 -- \u203c !!\U0001F602', u'Hello world!',
    u'This is a sample tweet #example',
    u'A long sentence that does not have a meaningful ending but nonetheless is pretty.'
]

tokens, infos, stats = st.tokenize_sentences(test_sentences)

print(tokens)
print(infos)
print(stats)
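
# Sanity check (the shape is an assumption following the docstring: four
# sentences, padded with 0s to the fixed length of 30):
print(tokens.shape)  # expected: (4, 30)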