Example #1
def test_encode_texts():
    """ Text encoding is stable.
    """

    TEST_SENTENCES = [
        'I love mom\'s cooking', 'I love how you never reply back..',
        'I love cruising with my homies', 'I love messing with yo mind!!',
        'I love you and now you\'re just gone..', 'This is shit',
        'This is the shit'
    ]

    maxlen = 30
    batch_size = 32

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_feature_encoding(PRETRAINED_PATH)
    print(model)
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
    encoding = model(tokenized)

    avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
    assert np.allclose(avg_across_sentences,
                       np.array([-0.023, 0.021, -0.037, -0.001, -0.005]))
Example #2
    def load_torchmoji(self):
        """ Load the torchMoji tokenizer and feature-encoding model.

        These are used to score texts for emoji distribution; the
        resulting emoji ids (0-63) correspond to the mapping in the
        emoji_overview.png file at the root of the torchMoji repo.
        """
        import json
        import numpy as np
        import os
        import torch  # needed for torch.no_grad() below
        from torchmoji.sentence_tokenizer import SentenceTokenizer
        from torchmoji.model_def import torchmoji_feature_encoding
        from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

        print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)

        maxlen = 130
        texts = [
            "Testing!",
        ]

        with torch.no_grad():
            # init model
            st = SentenceTokenizer(vocabulary,
                                   maxlen,
                                   ignore_sentences_with_only_custom=True)
            torchmoji = torchmoji_feature_encoding(PRETRAINED_PATH)
        return st, torchmoji
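A minimal usage sketch for the loader above (hypothetical: it assumes an instance `scorer` of the surrounding class): tokenize a text and feed it through the returned feature encoder, which in torchMoji yields a 2304-dimensional vector per text.

st, torchmoji = scorer.load_torchmoji()  # `scorer` is a hypothetical instance
tokenized, _, _ = st.tokenize_sentences(["Testing!"])
with torch.no_grad():
    features = torchmoji(tokenized)  # numpy array of shape (1, 2304)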
Example #3
    def __init__(self, use_cuda=True):
        super(MojiModel, self).__init__()
        self.use_cuda = use_cuda
        self.EMOJIS = EMOJIS
        self.emoji_model = torchmoji_emojis(PRETRAINED_PATH)
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        self.tokenizer = SentenceTokenizer(vocabulary, 100)
        print(self.emoji_model)
        self.feat_model = torchmoji_feature_encoding(PRETRAINED_PATH)
        if use_cuda:
            self.emoji_model = self.emoji_model.cuda()
            self.feat_model = self.feat_model.cuda()
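A hypothetical usage sketch for the class above (it assumes `numpy` is imported as `np` and that the snippet's EMOJIS list maps ids 0-63 to emoji aliases); it runs on CPU and prints the top-5 emojis for one sentence.

model = MojiModel(use_cuda=False)  # CPU, so numpy inputs work end to end
tokens, _, _ = model.tokenizer.tokenize_sentences(["I love mom's cooking"])
prob = model.emoji_model(tokens)[0]  # 64-way emoji probability distribution
top5 = np.argsort(prob)[::-1][:5]    # ids as in emoji_overview.png
print([model.EMOJIS[i] for i in top5])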
Example #4
    def __init__(self,
                 vocab,
                 hidden_size,
                 num_layers,
                 max_length=700,
                 input_dropout=0.0,
                 layer_dropout=0.0,
                 is_bidirectional=False,
                 attentive=False,
                 multiattentive=True,
                 num_heads=5,
                 total_key_depth=500,
                 total_value_depth=1000,
                 use_mask=True):
        super(HDeepMoji, self).__init__()

        self.input_dropout = nn.Dropout(input_dropout)
        self.layer_dropout = nn.Dropout(layer_dropout)
        self.vocab = vocab

        self.torchmoji = torchmoji_feature_encoding(PRETRAINED_PATH)
        embedding_size = 2304
        # self.lstm = nn.LSTM(embedding_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=is_bidirectional)
        self.lstm_layer = nn.LSTM(embedding_size,
                                  hidden_size=hidden_size,
                                  num_layers=num_layers,
                                  bidirectional=is_bidirectional,
                                  batch_first=False)
        self.W = nn.Linear(
            hidden_size * 2 if is_bidirectional else hidden_size,
            4)  # 4 emotion classes
        self.softmax = nn.Softmax(dim=1)

        self.num_layers = num_layers
        self.is_bidirectional = is_bidirectional

        self.use_mask = use_mask
        self.attentive = attentive
        if attentive:
            # self.word_attention = Attention(hidden_size*2 if is_bidirectional else hidden_size)
            self.sentences_attention = Attention(
                hidden_size * 2 if is_bidirectional else hidden_size)
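A sketch of how a forward pass over these pieces might look; this is hypothetical (the actual forward method is not part of this snippet) and assumes `tokenized` is a token-id array for the sentences of one dialogue.

# Hypothetical forward pass, not the author's actual method:
feats = self.torchmoji(tokenized)                  # (num_sents, 2304) numpy array
x = torch.from_numpy(feats).float().unsqueeze(1)   # (seq, batch=1, 2304)
out, _ = self.lstm_layer(self.input_dropout(x))
logits = self.W(self.layer_dropout(out[-1]))
probs = self.softmax(logits)                       # (1, 4) emotion distribution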
Example #5
    def __init__(self, counter, name, max_concurrent_queries):
        super().__init__(counter, name, max_concurrent_queries)

        sys.path.append(os.path.join(self.data_dir, "tacotron2-PPP-1.3.0"))
        from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
        from torchmoji.model_def import torchmoji_emojis, torchmoji_feature_encoding
        from torchmoji.sentence_tokenizer import SentenceTokenizer

        self.log.debug("Loading model")

        with open(VOCAB_PATH, "r") as f:
            vocabulary = json.load(f)

        with torch.no_grad():
            self.tm_sentence_tokenizer = SentenceTokenizer(
                vocabulary, MAX_LEN, ignore_sentences_with_only_custom=True
            )
            self.tm_torchmoji = torchmoji_feature_encoding(PRETRAINED_PATH)
            self.tm_model = torchmoji_emojis(PRETRAINED_PATH)

        self.log.debug("Model loaded")
Example #6
elmo = Elmo(options_file, weight_file, 2, dropout=0).cuda()
elmo.eval()

EMOS = EMO_LIST
EMOS_DIC = dict(zip(EMOS, range(len(EMOS))))

tokenizer = GloveTokenizer()

# deepmoji
print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, PAD_LEN)

print('Loading model from {}.'.format(PRETRAINED_PATH))
emoji_model = torchmoji_feature_encoding(PRETRAINED_PATH)
emoji_model.eval()


class EmotionDataLoader(Dataset):
    def __init__(self, X, y, pad_len, max_size=None):
        self.source = []
        self.source_len = []
        self.target = y
        self.pad_len = pad_len
        self.read_data(X, y)

    def read_data(self, X, y):
        for src in X:
            src = tokenizer.encode_ids(src)
            self.source_len.append(min(len(src), self.pad_len))
            if len(src) < self.pad_len:
                # assumed completion; the original snippet is truncated here
                src = src + [0] * (self.pad_len - len(src))
            self.source.append(src[:self.pad_len])
Example #7
TEST_SENTENCES = ['I love mom\'s cooking',
                  'I love how you never reply back..',
                  'I love cruising with my homies',
                  'I love messing with yo mind!!',
                  'I love you and now you\'re just gone..',
                  'This is shit',
                  'This is the shit']

maxlen = 30
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = torchmoji_feature_encoding(PRETRAINED_PATH)
print(model)

print('Encoding texts..')
encoding = model(tokenized)

print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
print(encoding[0,:5])

# Now you could visualize the encodings to see differences,
# run a logistic regression classifier on top,
# or basically anything you'd like to do.
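As a concrete version of that last suggestion, a minimal sketch assuming scikit-learn is installed; the positive/negative labels are made up purely for illustration.

from sklearn.linear_model import LogisticRegression

labels = [1, 0, 1, 1, 0, 0, 1]  # hypothetical sentiment tags for TEST_SENTENCES
clf = LogisticRegression(max_iter=1000).fit(encoding, labels)
print(clf.predict(encoding[:2]))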