Example #1
    def __init__(self,
                 data,
                 vocab,
                 hier=False,
                 elmo=False,
                 elmo_pre=None,
                 deepmoji=False):
        self.id, self.X, self.y = data
        self.emotion2label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}

        if self.y is not None:
            self.y = np.array(
                [self.emotion2label[label] for label in self.y])
        self.vocab = vocab
        self.num_total_seqs = len(self.X)
        self.tt = MyTokenizer()

        with open(VOCAB_PATH, 'r') as f:
            deepmoji_vocab = json.load(f)
        self.deepmoji_tt = SentenceTokenizer(deepmoji_vocab, 100)

        self.hier = hier
        self.elmo = elmo
        self.elmo_pre = elmo_pre  # pre-extracted elmo embeddings
        self.deepmoji = deepmoji
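For orientation, a hypothetical construction of the dataset above; the enclosing class name is not shown in this excerpt (it is called EmotionDataset here purely for illustration), and vocab, MyTokenizer and VOCAB_PATH are assumed to come from the surrounding project.

# Hypothetical usage sketch; EmotionDataset is a placeholder name for the
# class whose __init__ is shown above.
ids = [0, 1]
texts = ["I am so happy today", "why do you never reply"]
labels = ["happy", "angry"]
dataset = EmotionDataset((ids, texts, labels), vocab)
print(dataset.y)  # -> [1 3], the labels mapped through emotion2label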
Example #2
def test_encode_texts():
    """ Text encoding is stable.
    """

    TEST_SENTENCES = [
        'I love mom\'s cooking', 'I love how you never reply back..',
        'I love cruising with my homies', 'I love messing with yo mind!!',
        'I love you and now you\'re just gone..', 'This is shit',
        'This is the shit'
    ]

    maxlen = 30
    batch_size = 32

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_feature_encoding(PRETRAINED_PATH)
    print(model)
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
    encoding = model(tokenized)

    avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
    assert np.allclose(avg_across_sentences,
                       np.array([-0.023, 0.021, -0.037, -0.001, -0.005]))
Example #3
def test_dataset_split_parameter():
    """ Dataset is split in the desired ratios
    """
    split_parameter = [0.7, 0.1, 0.2]
    st = SentenceTokenizer(vocab, 30)

    result, result_dicts, _ = st.split_train_val_test(sentences,
                                                      dicts,
                                                      split_parameter,
                                                      extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    assert len(train) == len(sentences) * split_parameter[0]
    assert len(val) == len(sentences) * split_parameter[1]
    assert len(test) == len(sentences) * split_parameter[2]

    assert len(train_dicts) == len(dicts) * split_parameter[0]
    assert len(val_dicts) == len(dicts) * split_parameter[1]
    assert len(test_dicts) == len(dicts) * split_parameter[2]
Example #4
class MojiModel(nn.Module):
    def __init__(self, use_cuda=True):
        super(MojiModel, self).__init__()
        self.use_cuda = use_cuda
        self.EMOJIS = EMOJIS
        self.emoji_model = torchmoji_emojis(PRETRAINED_PATH)
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        self.tokenizer = SentenceTokenizer(vocabulary, 100)
        print(self.emoji_model)
        self.feat_model = torchmoji_feature_encoding(PRETRAINED_PATH)
        if use_cuda:
            self.emoji_model = self.emoji_model.cuda()
            self.feat_model = self.feat_model.cuda()

    def predict(self, input_txt):
        input_txt = [input_txt]
        tokenized, _, _ = self.tokenizer.tokenize_sentences(input_txt)
        if self.use_cuda:
            tokenized = torch.cuda.LongTensor(tokenized.astype('int32'))
        prob = self.emoji_model(tokenized)[0]
        return prob

    def moji_feat(self, input_txt):
        input_txt = [input_txt]
        tokenized, _, _ = self.tokenizer.tokenize_sentences(input_txt)
        if self.use_cuda:
            tokenized = torch.cuda.LongTensor(tokenized.astype('int32'))
        return self.feat_model(tokenized)[0]

    def to_emoji(self, idx):
        return emoji.emojize(self.EMOJIS[idx], use_aliases=True)
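A minimal usage sketch for the class above, assuming a CUDA device is available, numpy/torch are imported as in the other examples, and PRETRAINED_PATH / VOCAB_PATH resolve to the torchMoji weights and vocabulary.

# Sketch only: rank the 64 emoji probabilities returned by predict().
model = MojiModel(use_cuda=True)
prob = model.predict("I love cruising with my homies")
prob = prob.detach().cpu().numpy() if torch.is_tensor(prob) else np.asarray(prob)
top5 = np.argsort(prob)[-5:][::-1]  # indices of the five most probable emojis
print([model.to_emoji(int(i)) for i in top5])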
Example #5
def test_encode_texts():
    """ Text encoding is stable.
    """

    TEST_SENTENCES = ['I love mom\'s cooking',
                      'I love how you never reply back..',
                      'I love cruising with my homies',
                      'I love messing with yo mind!!',
                      'I love you and now you\'re just gone..',
                      'This is shit',
                      'This is the shit']


    maxlen = 30
    batch_size = 32

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_feature_encoding(PRETRAINED_PATH)
    print(model)
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
    encoding = model(tokenized)

    avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
    assert np.allclose(avg_across_sentences, np.array([-0.023, 0.021, -0.037, -0.001, -0.005]))
Example #6
async def predict_sentence_emojis(sentence: str,
                                  num_to_predict: int = 5) -> dict:
    """
    Predict top n emojis based on the sentence
    :param sentence: sentence used in prediction
    :param num_to_predict: number of top emojis to return
    :return: Dictionary where key is predicted emoji and value is its probability
    """

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, MAXLEN)

    model = torchmoji_emojis(PRETRAINED_PATH)
    print('Running predictions.')
    tokenized, _, _ = st.tokenize_sentences([sentence])
    prob = model(tokenized)[0]

    ind_top = top_elements(prob, num_to_predict)
    emojis = list(map(lambda x: EMOJIS[x], ind_top))

    # Might be useful if we need to send it this way
    # emojis_unicode_escape = [unicode_codes.EMOJI_ALIAS_UNICODE[emoj].encode('unicode-escape') for emoj in emojis]

    emojis_unicode = [
        unicode_codes.EMOJI_ALIAS_UNICODE[emoj] for emoj in emojis
    ]
    return dict(zip(emojis_unicode, prob[ind_top]))
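A small driver for the coroutine above; this is a hypothetical snippet that assumes MAXLEN, EMOJIS, top_elements and the torchMoji imports are already defined in the same module.

import asyncio

# Run the coroutine once and print the predicted emojis with their probabilities.
result = asyncio.run(predict_sentence_emojis("This is the shit", num_to_predict=3))
for emoji_char, p in result.items():
    print(emoji_char, round(float(p), 3))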
Example #7
def load_benchmark(path, vocab, extend_with=0):
    """ Loads the given benchmark dataset.

        Tokenizes the texts using the provided vocabulary, extending it with
        words from the training dataset if extend_with > 0. Splits them into
        three lists: training, validation and testing (in that order).

        Also calculates the maximum length of the texts and the
        suggested batch_size.

    # Arguments:
        path: Path to the dataset to be loaded.
        vocab: Vocabulary to be used for tokenizing texts.
        extend_with: If > 0, the vocabulary will be extended with up to
            extend_with tokens from the training set before tokenizing.

    # Returns:
        A dictionary with the following fields:
            texts: List of three lists, containing tokenized inputs for
                training, validation and testing (in that order).
            labels: List of three lists, containing labels for training,
                validation and testing (in that order).
            added: Number of tokens added to the vocabulary.
            batch_size: Batch size.
            maxlen: Maximum length of an input.
    """
    # Pre-processing dataset
    with open(path, 'rb') as dataset:
        if IS_PYTHON2:
            data = pickle.load(dataset)
        else:
            data = pickle.load(dataset, fix_imports=True)

    # Decode data
    try:
        texts = [unicode_(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    # Extract labels
    labels = [x['label'] for x in data['info']]

    batch_size, maxlen = calculate_batchsize_maxlen(texts)

    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(texts,
                                                   labels,
                                                   [data['train_ind'],
                                                    data['val_ind'],
                                                    data['test_ind']],
                                                   extend_with=extend_with)
    return {'texts': texts,
            'labels': labels,
            'added': added,
            'batch_size': batch_size,
            'maxlen': maxlen}
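A sketch of how the returned dictionary might be consumed; the pickle path and the vocab variable below are placeholders, not values from the original project.

# Hypothetical benchmark path; any pickled dataset in the expected format works.
data = load_benchmark('datasets/SS-Twitter/raw.pickle', vocab, extend_with=10000)
train_X, val_X, test_X = data['texts']
train_y, val_y, test_y = data['labels']
print('added {} tokens, batch_size={}, maxlen={}'.format(
    data['added'], data['batch_size'], data['maxlen']))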
Example #9
def get_emotion_features_from_text(text, audio_filename):
    # https://github.com/huggingface/torchMoji/blob/master/examples/score_texts_emojis.py

    if text == '':
        emoji_ids = []
        one_hot_encodings = []
    else:
        text = [text]

        def top_elements(array, k):
            ind = np.argpartition(array, -k)[-k:]
            return ind[np.argsort(array[ind])][::-1]

        maxlen = 30

        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)

        st = SentenceTokenizer(vocabulary, maxlen)

        model = torchmoji_emojis(PRETRAINED_PATH)
        tokenized, _, _ = st.tokenize_sentences(text)
        prob = model(tokenized)

        # Find top emojis for each sentence. Emoji ids (0-63)
        # correspond to the mapping in emoji_overview.png
        # at the root of the torchMoji repo.
        scores = []
        for i, t in enumerate(text):
            t_tokens = tokenized[i]
            t_score = [t]
            t_prob = prob[i]
            ind_top = top_elements(t_prob, 5)
            t_score.append(sum(t_prob[ind_top]))
            t_score.extend(ind_top)
            t_score.extend([t_prob[ind] for ind in ind_top])
            scores.append(t_score)

        emoji_ids = scores[0][2:2 + 5]
        one_hot_encodings = []
        for emoji_idx in emoji_ids:
            one_hot_encodings.append(
                [0 if i != emoji_idx else 1 for i in range(64)])

    a = audio_filename.split('/')

    filename = '/' + '/'.join(
        a[1:-1]) + '/onehot_emotion_' + a[-1].split('.wav')[0] + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(one_hot_encodings, f)

    filename = '/' + '/'.join(
        a[1:-1]) + '/emoji_ids_' + a[-1].split('.wav')[0] + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(emoji_ids, f)

    return emoji_ids, one_hot_encodings
Example #10
def test_id_to_sentence():
    """Tokenizing and converting back preserves the input.
    """
    vb = {'CUSTOM_MASK': 0, 'aasdf': 1000, 'basdf': 2000}

    sentence = 'aasdf basdf basdf basdf'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == sentence
Example #11
    def __init__(self):
        # Tokenizing using dictionary
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        self.st = SentenceTokenizer(vocabulary, 30)
        # Loading model
        self.model = torchmoji_emojis(PRETRAINED_PATH)
        # Running predictions
        self.dangoURL = "https://emoji.getdango.com/api/emoji?q="
Example #12
def test_id_to_sentence_with_unknown():
    """Tokenizing and converting back preserves the input, except for unknowns.
    """
    vb = {'CUSTOM_MASK': 0, 'CUSTOM_UNKNOWN': 1, 'aasdf': 1000, 'basdf': 2000}

    sentence = 'aasdf basdf ccc'
    expected = 'aasdf basdf CUSTOM_UNKNOWN'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == expected
Example #13
    def __init__(self, use_cuda=True):
        super(MojiModel, self).__init__()
        self.use_cuda = use_cuda
        self.EMOJIS = EMOJIS
        self.emoji_model = torchmoji_emojis(PRETRAINED_PATH)
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        self.tokenizer = SentenceTokenizer(vocabulary, 100)
        print(self.emoji_model)
        self.feat_model = torchmoji_feature_encoding(PRETRAINED_PATH)
        if use_cuda:
            self.emoji_model = self.emoji_model.cuda()
            self.feat_model = self.feat_model.cuda()
Example #14
    def __init__(self, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.accuracy = MicroMetrics(vocab)
        self.label_index_to_label = self.vocab.get_index_to_token_vocabulary(
            'labels')
        final_concatenated_dimension = 64 * 3
        self.input_layer = torch.nn.Linear(
            in_features=final_concatenated_dimension, out_features=64)
        self.output_layer = torch.nn.Linear(
            in_features=64, out_features=vocab.get_vocab_size("labels"))
        self.sigmoid = nn.Sigmoid()
        with open(VOCAB_PATH, 'r') as f:
            self.vocabulary = json.load(f)
            self.st = SentenceTokenizer(self.vocabulary, 20)
        self.model = torchmoji_emojis(PRETRAINED_PATH)
Example #15
def convert_dataset(filepath, extend_with, vocab):
    print('-- Generating {} '.format(filepath))
    sys.stdout.flush()
    st = SentenceTokenizer(vocab, maxlen)
    tokenized, dicts, _ = st.split_train_val_test(
        texts,
        labels, [data['train_ind'], data['val_ind'], data['test_ind']],
        extend_with=extend_with)
    pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
                         dicts[0], dicts[1], dicts[2])
    with open(filepath, 'wb') as f:
        pickle.dump(pick, f)
    cover = coverage(tokenized[2])

    print('     done. Coverage: {}'.format(cover))
Example #16
    def load_torchmoji(self):
        """ Use torchMoji to score texts for emoji distribution.
        
        The resulting emoji ids (0-63) correspond to the mapping
        in emoji_overview.png file at the root of the torchMoji repo.
        
        Writes the result to a csv file.
        """
        import json
        import numpy as np
        import os
        from torchmoji.sentence_tokenizer import SentenceTokenizer
        from torchmoji.model_def import torchmoji_feature_encoding
        from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

        print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)

        maxlen = 130
        texts = [
            "Testing!",
        ]

        with torch.no_grad():
            # init model
            st = SentenceTokenizer(vocabulary,
                                   maxlen,
                                   ignore_sentences_with_only_custom=True)
            torchmoji = torchmoji_feature_encoding(PRETRAINED_PATH)
        return st, torchmoji
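A hedged follow-up: from another method of the same (unnamed) class, the returned pair could be used to encode a sentence roughly like this.

        # Sketch only; 'self' refers to the class that defines load_torchmoji.
        st, torchmoji = self.load_torchmoji()
        tokenized, _, _ = st.tokenize_sentences(["Testing!"])
        with torch.no_grad():
            features = torchmoji(tokenized)  # one feature vector per input sentence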
Example #17
def init_tokenizer_emotions(max_len):
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, max_len)
    model = torchmoji_emojis(PRETRAINED_PATH)

    return st, model
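A quick usage sketch for the helper above; the sentence and max_len are arbitrary.

st, model = init_tokenizer_emotions(max_len=30)
tokens, _, _ = st.tokenize_sentences(["I love mom's cooking"])
probs = model(tokens)[0]  # probabilities over the 64 torchMoji emoji classes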
Example #18
class EmotionBiLSTM(Model):
    def __init__(self, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.accuracy = MicroMetrics(vocab)
        self.label_index_to_label = self.vocab.get_index_to_token_vocabulary(
            'labels')
        final_concatenated_dimension = 64 * 3
        self.input_layer = torch.nn.Linear(
            in_features=final_concatenated_dimension, out_features=64)
        self.output_layer = torch.nn.Linear(
            in_features=64, out_features=vocab.get_vocab_size("labels"))
        self.sigmoid = nn.Sigmoid()
        with open(VOCAB_PATH, 'r') as f:
            self.vocabulary = json.load(f)
            self.st = SentenceTokenizer(self.vocabulary, 20)
        self.model = torchmoji_emojis(PRETRAINED_PATH)

    def tokenize(self, sentences):
        tokenized, _, _ = self.st.tokenize_sentences(sentences)
        return torch.from_numpy(tokenized.astype(np.int64))

    def forward(self,
                turn1,
                turn2,
                turn3,
                conversation_id: str,
                turns: str,
                labels: torch.Tensor = None):
        # TODO: Look up reverse embedding of padded sequences
        turn1 = [x['turn1'] for x in turn1]
        turn2 = [x['turn2'] for x in turn2]
        turn3 = [x['turn3'] for x in turn3]
        predictions1 = self.model(self.tokenize(turn1))
        predictions2 = self.model(self.tokenize(turn2))
        predictions3 = self.model(self.tokenize(turn3))
        predictions = torch.cat([predictions1, predictions2, predictions3],
                                dim=1)
        input2hidden = self.input_layer(predictions)
        label_logits = self.sigmoid(self.output_layer(input2hidden))

        # self.matrix_attention = self.matrix_attention(encoded_turn1and2, encoded_turn3)
        label_logits = F.softmax(label_logits, dim=1)
        output = {
            "prediction": [
                self.label_index_to_label[x]
                for x in label_logits.argmax(dim=1).numpy()
            ],
            "ids": [x["ids"] for x in conversation_id],
            "turns": [x["turns"] for x in turns]
        }
        if labels is not None:
            # TODO: check loss with and without the mask
            self.accuracy(label_logits, labels)
            output["loss"] = cross_entropy_loss(label_logits, labels)
        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}
Example #19
    def __init__(self, *args, **kwargs):
        HTTPServer.__init__(self, *args, **kwargs)
        with open(vocab_file_path, 'r') as f:
            vocabulary = json.load(f)

        max_sentence_length = 100

        self.st = SentenceTokenizer(vocabulary, max_sentence_length)
        self.model = torchmoji_emojis(model_weights_path)
Example #20
def test_score_emoji():
    """ Emoji predictions make sense.
    """
    test_sentences = [
        'I love mom\'s cooking',
        'I love how you never reply back..',
        'I love cruising with my homies',
        'I love messing with yo mind!!',
        'I love you and now you\'re just gone..',
        'This is shit',
        'This is the shit'
    ]

    expected = [
        np.array([36,  4,  8, 16, 47]),
        np.array([1, 19, 55, 25, 46]),
        np.array([31,  6, 30, 15, 13]),
        np.array([54, 44,  9, 50, 49]),
        np.array([46,  5, 27, 35, 34]),
        np.array([55, 32, 27,  1, 37]),
        np.array([48, 11,  6, 31,  9])
    ]

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    # Initialize by loading dictionary and tokenize texts
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, 30)
    tokens, _, _ = st.tokenize_sentences(test_sentences)

    # Load model and run
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    prob = model(tokens)

    # Find top emojis for each sentence
    for i, t_prob in enumerate(list(prob)):
        assert np.array_equal(top_elements(t_prob, 5), expected[i])
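The top_elements helper used throughout these examples returns the indices of the k largest values in descending order of probability; a quick check with made-up numbers (assuming the helper is defined at module scope as above):

probs = np.array([0.1, 0.4, 0.05, 0.3, 0.15])
print(top_elements(probs, 3))  # -> [1 3 4]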
Example #22
def test():
    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    argparser = argparse.ArgumentParser()
    argparser.add_argument('--text',
                           type=str,
                           required=True,
                           help="Input text to emojize")
    argparser.add_argument('--maxlen',
                           type=int,
                           default=30,
                           help="Max length of input text")
    args = argparser.parse_args()

    # Tokenizing using dictionary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, args.maxlen)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)
    # Running predictions
    tokenized, _, _ = st.tokenize_sentences([args.text])
    # Get sentence probability
    prob = model(tokenized)[0]

    # Top emoji ids
    emoji_ids = top_elements(prob, 5)

    # Map to emojis
    emojis = map(lambda x: EMOJIS[x], emoji_ids)

    print(
        emoji.emojize("{} {}".format(args.text, ' '.join(emojis)),
                      use_aliases=True))
Example #23
def test_score_emoji():
    """ Emoji predictions make sense.
    """
    test_sentences = [
        'I love mom\'s cooking', 'I love how you never reply back..',
        'I love cruising with my homies', 'I love messing with yo mind!!',
        'I love you and now you\'re just gone..', 'This is shit',
        'This is the shit'
    ]

    expected = [
        np.array([36, 4, 8, 16, 47]),
        np.array([1, 19, 55, 25, 46]),
        np.array([31, 6, 30, 15, 13]),
        np.array([54, 44, 9, 50, 49]),
        np.array([46, 5, 27, 35, 34]),
        np.array([55, 32, 27, 1, 37]),
        np.array([48, 11, 6, 31, 9])
    ]

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    # Initialize by loading dictionary and tokenize texts
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, 30)
    tokens, _, _ = st.tokenize_sentences(test_sentences)

    # Load model and run
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    prob = model(tokens)

    # Find top emojis for each sentence
    for i, t_prob in enumerate(list(prob)):
        assert np.array_equal(top_elements(t_prob, 5), expected[i])
Example #24
def test_dataset_split_explicit():
    """ Dataset is split according to given indices
    """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(sentences,
                                                          dicts,
                                                          split_parameter,
                                                          extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)
Example #25
def text_to_emoji(text, maxlen):
    # Tokenizing using dictionary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)
    # Running predictions
    tokenized, _, _ = st.tokenize_sentences([text])
    # Get sentence probability
    prob = model(tokenized)[0]

    # Top emoji id
    emoji_ids = top_elements(prob, 5)

    # map to emojis
    emojis = map(lambda x: EMOJIS[x], emoji_ids)

    print(
        emoji.emojize("{} {}".format(text, ' '.join(emojis)),
                      use_aliases=True))
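A hypothetical call for the function above, assuming top_elements, EMOJIS, emoji and the torchMoji paths are in scope as in the neighbouring examples.

text_to_emoji("I love mom's cooking", maxlen=30)  # prints the text followed by its top 5 emojis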
Example #26
class EmojiPredictor(object):
    def __init__(self):
        # Tokenizing using dictionary
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        self.st = SentenceTokenizer(vocabulary, 30)
        # Loading model
        self.model = torchmoji_emojis(PRETRAINED_PATH)
        # Running predictions
        self.dangoURL = "https://emoji.getdango.com/api/emoji?q="

    def getPredictedEmojis(self, text):
        api_response = ''
        try:
            # It turned out that Dango has shut down its API service,
            # so in practice we fall back to the local deepmoji model below.
            r = requests.get("https://emoji.getdango.com/api/emoji",
                             params={"q": text})
            api_response = json.loads(r.text)
        except Exception:
            pass

        if 'results' in api_response:
            res = [item['text'] for item in api_response['results']]
            if len(res) < 5:
                extraemojis = self.localPredict(text)
                for k in extraemojis:
                    if k not in res:
                        res.append(k)
                    if len(res) == 5:
                        break
                return res
            else:
                return res[:5]
        else:
            return self.localPredict(text)

    def localPredict(self, text):
        tokenized, _, _ = self.st.tokenize_sentences([text.lower()])
        # Get sentence probability
        prob = self.model(tokenized)[0]
        # Top emoji ids (one extra is requested so id 42 can be dropped)
        emoji_ids = top_elements(prob, 6)
        emoji_ids = emoji_ids[emoji_ids != 42]  # drop emoji id 42, keeping probability order
        if len(emoji_ids) > 5:
            emoji_ids = emoji_ids[:5]
        # map to emojis
        emojis = map(lambda x: EMOJIS[x], emoji_ids)
        return emoji.emojize(' '.join(emojis), use_aliases=True).split()
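A short usage sketch; since the Dango API is gone, this effectively exercises localPredict.

predictor = EmojiPredictor()
print(predictor.getPredictedEmojis("I love cruising with my homies"))  # list of up to 5 emojis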
Example #27
def init():
    global sentence_tokenizer
    global model
    global emoji_desc, emoji_unicode

    max_token = 30
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    
    sentence_tokenizer = SentenceTokenizer(vocabulary, max_token)
    model = torchmoji_emojis(PRETRAINED_PATH)

    with open('data/emoji_codes.json') as f:
        emoji_desc = json.load(f)

    with open('data/wanted_emojis.csv') as f:
        emoji_unicode = list(csv.reader(f))
Example #28
class Emoji(runner.Runner):
    name = "emoji"

    def __init__(self, counter, name, max_concurrent_queries):
        super().__init__(counter, name, max_concurrent_queries)

        sys.path.append(os.path.join(self.data_dir, "tacotron2-PPP-1.3.0"))
        from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
        from torchmoji.model_def import torchmoji_emojis, torchmoji_feature_encoding
        from torchmoji.sentence_tokenizer import SentenceTokenizer

        self.log.debug("Loading model")

        with open(VOCAB_PATH, "r") as f:
            vocabulary = json.load(f)

        with torch.no_grad():
            self.tm_sentence_tokenizer = SentenceTokenizer(
                vocabulary, MAX_LEN, ignore_sentences_with_only_custom=True
            )
            self.tm_torchmoji = torchmoji_feature_encoding(PRETRAINED_PATH)
            self.tm_model = torchmoji_emojis(PRETRAINED_PATH)

        self.log.debug("Model loaded")

    async def func(self, request, **kwargs):
        text_batch = [self.normalize_input(request)]
        text_batch = [
            text.replace('"', "") for text in text_batch
        ]  # remove quotes from text
        tokenized, _, _ = self.tm_sentence_tokenizer.tokenize_sentences(text_batch)
        prob = self.tm_model(tokenized)[0]
        emoji_ids = top_elements(prob, 3)
        emojis = map(lambda x: EMOJIS[x], emoji_ids)
        emoji_score = [emoji.emojize(e, use_aliases=True) for e in emojis]
        return emoji_score
Example #29
    coverage_result = [p]
    print('Calculating coverage for {}'.format(p))
    with open(p, 'rb') as f:
        if IS_PYTHON2:
            s = pickle.load(f)
        else:
            s = pickle.load(f, fix_imports=True)

    # Decode data
    try:
        s['texts'] = [unicode(x) for x in s['texts']]
    except UnicodeDecodeError:
        s['texts'] = [x.decode('utf-8') for x in s['texts']]

    # Own
    st = SentenceTokenizer({}, 30)
    tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
                                              [s['train_ind'],
                                               s['val_ind'],
                                               s['test_ind']],
                                              extend_with=10000)
    coverage_result.append(coverage(tests[2]))

    # Last
    st = SentenceTokenizer(vocab, 30)
    tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
                                              [s['train_ind'],
                                               s['val_ind'],
                                               s['test_ind']],
                                              extend_with=0)
    coverage_result.append(coverage(tests[2]))
Example #30
    'I am sentence 7',
    'I am sentence 8',
    'I am sentence 9 newword',
    ]

INFO_DICTS = [
    {'label': 'sentence 0'},
    {'label': 'sentence 1'},
    {'label': 'sentence 2'},
    {'label': 'sentence 3'},
    {'label': 'sentence 4'},
    {'label': 'sentence 5'},
    {'label': 'sentence 6'},
    {'label': 'sentence 7'},
    {'label': 'sentence 8'},
    {'label': 'sentence 9'},
    ]

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)
st = SentenceTokenizer(vocab, 30)

# Split using the default split ratio
print(st.split_train_val_test(DATASET, INFO_DICTS))

# Split explicitly
print(st.split_train_val_test(DATASET,
                              INFO_DICTS,
                              [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
                              extend_with=1))
Example #31
TEST_SENTENCES = ['I love mom\'s cooking',
                  'I love how you never reply back..',
                  'I love cruising with my homies',
                  'I love messing with yo mind!!',
                  'I love you and now you\'re just gone..',
                  'This is shit',
                  'This is the shit']

maxlen = 30
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = torchmoji_feature_encoding(PRETRAINED_PATH)
print(model)

print('Encoding texts..')
encoding = model(tokenized)

print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
print(encoding[0,:5])

# Now you could visualize the encodings to see differences,
# run a logistic regression classifier on top,
# or basically anything you'd like to do.
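As one concrete follow-up to the comment above, a hedged sketch of fitting a scikit-learn logistic regression on the sentence encodings; the binary labels are invented purely for illustration.

from sklearn.linear_model import LogisticRegression

# Made-up labels for the 7 TEST_SENTENCES above (1 = positive, 0 = negative).
toy_labels = [1, 0, 1, 1, 0, 0, 1]
clf = LogisticRegression(max_iter=1000).fit(encoding, toy_labels)
print(clf.predict(encoding[:2]))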
Example #32
else:
    raise Exception('elmo model not recognized')

elmo = Elmo(options_file, weight_file, 2, dropout=0).cuda()
elmo.eval()

EMOS = EMO_LIST
EMOS_DIC = dict(zip(EMOS, range(len(EMOS))))

tokenizer = GloveTokenizer()

# deepmoji
print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, PAD_LEN)

print('Loading model from {}.'.format(PRETRAINED_PATH))
emoji_model = torchmoji_feature_encoding(PRETRAINED_PATH)
emoji_model.eval()


class EmotionDataLoader(Dataset):
    def __init__(self, X, y, pad_len, max_size=None):
        self.source = []
        self.source_len = []
        self.target = y
        self.pad_len = pad_len
        self.read_data(X, y)

    def read_data(self, X, y):
Example #33
from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.model_def import torchmoji_emojis
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH, EMOJIS

import json
import numpy as np
import emoji


def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, 300)
model = torchmoji_emojis(PRETRAINED_PATH)


def emojify_sentences(l):
    tokenized, _, _ = st.tokenize_sentences(l)
    prob = model(tokenized)

    result = []
    for i in range(len(l)):
        t_prob = prob[i]
        ind_top = top_elements(t_prob, 5)
        result.append(
            list([
                emoji.emojize(EMOJIS[i], use_aliases=True),
Example #34
GLOVE_EMB_PATH = opt.glovepath

options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
##########  Set Assumptions ############
##########  Set Assumptions ############

##########  Get Elmo and Emoji Embedding ############
##########  Get Elmo and Emoji Embedding ############
elmo = Elmo(options_file, weight_file, 2, dropout=0).cuda()
elmo.eval()

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
emoji_st = SentenceTokenizer(vocabulary, EMOJ_SENT_PAD_LEN)
##########  Get Elmo and Emoji Embedding ############
##########  Get Elmo and Emoji Embedding ############


def load_data_context(data_path='data/train.txt', is_train=True):

    data_list = []
    target_list = []
    f_data = open(data_path, 'r')
    data_lines = f_data.readlines()
    f_data.close()

    for i, text in enumerate(data_lines):
        # skip the first line
        if i == 0:
Example #35
"""
Take a given list of sentences and turn it into a numpy array, where each
number corresponds to a word. Padding is used (number 0) to ensure fixed length
of sentences.
"""

from __future__ import print_function, unicode_literals
import example_helper
import json
from torchmoji.sentence_tokenizer import SentenceTokenizer

with open('../model/vocabulary.json', 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, 30)
test_sentences = [
    '\u2014 -- \u203c !!\U0001F602',
    'Hello world!',
    'This is a sample tweet #example',
    ]

tokens, infos, stats = st.tokenize_sentences(test_sentences)

print(tokens)
print(infos)
print(stats)
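As the module docstring says, the result is a fixed-length matrix of word ids padded with 0 (the CUSTOM_MASK id); a quick sanity check:

print(tokens.shape)  # -> (3, 30): three sentences, padded/truncated to maxlen 30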
Example #36
    'status_id', 'sum(numScore)', 'text', 'created_at', 'name'
]]

#import and parse emoji codes
emoji_codes = pd.read_json(
    '/Users/ikennedy/Work/UW/Code/GIT/cl_lda/twitter/emojicodes.json',
    orient='values',
    typ='series').str.extract(r':(\w+):', expand=False).sort_index()

#import vocab and model, define sentence tokenizer, set chunk_size
os.getcwd()
with open('/Users/ikennedy/Documents/GitHub/torchMoji/model/vocabulary.json'
          ) as f:
    vocab = json.load(f)
model = torchmoji_emojis('twitter/pytorch_model.bin')
st = SentenceTokenizer(vocab, 30)

# Specify columns for the full df for:
# Twitter pull
df_full = pd.DataFrame(
    columns=['sentiment', 'id', 'date', 'query', 'screen_name', 'text'] +
    list(emoji_codes))
#Twitter sample
#df_full = pd.DataFrame(columns=['sentiment', 'text']+list(emoji_codes))
# Run in chunks to avoid overusing computational resources
chunk_size = 1000
for i in range(chunk_size, len(df) + chunk_size, chunk_size):
    if (i > len(df)):
        i = len(df)