Example #1
from __future__ import print_function
import json
from deepmoji.sentence_tokenizer import SentenceTokenizer
from deepmoji.model_def import deepmoji_feature_encoding
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

TEST_SENTENCES = [
    u'I love mom\'s cooking', u'I love how you never reply back..',
    u'I love cruising with my homies', u'I love messing with yo mind!!',
    u'I love you and now you\'re just gone..', u'This is shit',
    u'This is the shit'
]

maxlen = 30
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH)
model.summary()

print('Encoding texts..')
encoding = model.predict(tokenized)

print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
print(encoding[0, :5])

# Now you could visualize the encodings to see differences,
# run a logistic regression classifier on top,
# or basically anything you'd like to do.
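
# A minimal sketch of the logistic-regression idea mentioned above,
# assuming hypothetical binary sentiment labels for the seven test
# sentences (scikit-learn is not part of the original snippet):
from sklearn.linear_model import LogisticRegression

labels = [1, 0, 1, 1, 0, 0, 1]  # hypothetical labels, one per sentence
clf = LogisticRegression().fit(encoding, labels)
print(clf.predict(encoding[:2]))  # predictions for the first two sentences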
Example #2
from __future__ import print_function
from keras.models import load_model, Model
from keras.utils import CustomObjectScope
from deepmoji.sentence_tokenizer import SentenceTokenizer
from deepmoji.attlayer import AttentionWeightedAverage
import codecs
import json
import sys
import numpy as np
TEST_SENTENCES = codecs.open(sys.argv[1], 'r', encoding='utf-8').readlines()
print('Loading model: ', sys.argv[2])
with CustomObjectScope({'AttentionWeightedAverage': AttentionWeightedAverage}):
    model = load_model(sys.argv[2])
model.summary()

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

st = SentenceTokenizer(vocab, 20)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

print('Reverse tokenizing.', file=sys.stdout)
st_id2tok = [None] * len(st.vocabulary)
for w in st.vocabulary:
    st_id2tok[st.vocabulary[w]] = w
tokenized_sents = [[st_id2tok[i] for i in row] for row in tokenized]

# Build a second model that exposes the 'attlayer' output. The layer
# outputs both the weighted average and the attention weights, which the
# unpacking below relies on.
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer('attlayer').output)

print('Running predictions.')
prob = model.predict(tokenized)
_, att_weights = intermediate_layer_model.predict(tokenized)
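
# A sketch (not in the original) of pairing each token with its attention
# weight via the reverse-tokenized sentences built above; assumes
# att_weights has shape (n_sentences, maxlen):
for sent, weights in zip(tokenized_sents, att_weights):
    ranked = sorted(zip(sent, weights), key=lambda t: t[1], reverse=True)
    print(ranked[:3])  # the three most-attended tokens of the sentence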
Example #3
from __future__ import print_function
import json
import pickle
from deepmoji.sentence_tokenizer import SentenceTokenizer, coverage

# DATASET_PATHS is assumed to be defined elsewhere as a list of paths to
# pickled benchmark datasets; the vocabulary path mirrors Example #4.
with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

results = []
for p in DATASET_PATHS:
    coverage_result = [p]
    print('Calculating coverage for {}'.format(p))
    with open(p, 'rb') as f:
        s = pickle.load(f)

    # Decode data
    try:
        s['texts'] = [unicode(x) for x in s['texts']]
    except UnicodeDecodeError:
        s['texts'] = [x.decode('utf-8') for x in s['texts']]

    # Own
    st = SentenceTokenizer({}, 30)
    tests, dicts, _ = st.split_train_val_test(
        s['texts'],
        s['info'], [s['train_ind'], s['val_ind'], s['test_ind']],
        extend_with=10000)
    coverage_result.append(coverage(tests[2]))

    # Last
    st = SentenceTokenizer(vocab, 30)
    tests, dicts, _ = st.split_train_val_test(
        s['texts'],
        s['info'], [s['train_ind'], s['val_ind'], s['test_ind']],
        extend_with=0)
    coverage_result.append(coverage(tests[2]))

    # Full
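    # The "Full" branch is truncated in the source and left as-is.
    # A sketch (an assumption, mirroring the pattern above) of collecting
    # the per-dataset rows computed so far:
    results.append(coverage_result)

for r in results:
    print(r)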
Example #4
"""
Take a given list of sentences and turn it into a numpy array, where each
number corresponds to a word. Padding is used (number 0) to ensure fixed length
of sentences.
"""

from __future__ import print_function
import thesis_helper
import json
from deepmoji.sentence_tokenizer import SentenceTokenizer

with open('../model/vocabulary.json', 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, 30)
test_sentences = [
    u'\u2014 -- \u203c !!\U0001F602', u'Hello world!',
    u'This is a sample tweet #example',
    u'A long sentence that does not have a meaningful ending but nonetheless is pretty.'
]

tokens, infos, stats = st.tokenize_sentences(test_sentences)

print(tokens)
print(infos)
print(stats)
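
# Sanity check (the shape is an assumption following the docstring: four
# sentences, padded with 0s to the fixed length of 30):
print(tokens.shape)  # expected: (4, 30)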