def __init__(self, data, vocab, hier=False, elmo=False, elmo_pre=None, deepmoji=False):
    self.id, self.X, self.y = data
    self.emotion2label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}
    if self.y is not None:
        self.y = np.array(
            list(map(lambda label: self.emotion2label[label], self.y)))
    self.vocab = vocab
    self.num_total_seqs = len(self.X)
    self.tt = MyTokenizer()
    with open(VOCAB_PATH, 'r') as f:
        deepmoji_vocab = json.load(f)
    self.deepmoji_tt = SentenceTokenizer(deepmoji_vocab, 100)
    self.hier = hier
    self.elmo = elmo
    self.elmo_pre = elmo_pre  # pre-extracted ELMo embeddings
    self.deepmoji = deepmoji
def test_encode_texts():
    """ Text encoding is stable. """
    TEST_SENTENCES = ['I love mom\'s cooking',
                      'I love how you never reply back..',
                      'I love cruising with my homies',
                      'I love messing with yo mind!!',
                      'I love you and now you\'re just gone..',
                      'This is shit',
                      'This is the shit']

    maxlen = 30
    batch_size = 32

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_feature_encoding(PRETRAINED_PATH)
    print(model)

    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
    encoding = model(tokenized)

    avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
    assert np.allclose(avg_across_sentences,
                       np.array([-0.023, 0.021, -0.037, -0.001, -0.005]))
def test_dataset_split_parameter():
    """ Dataset is split in the desired ratios """
    split_parameter = [0.7, 0.1, 0.2]
    st = SentenceTokenizer(vocab, 30)

    result, result_dicts, _ = st.split_train_val_test(sentences, dicts,
                                                      split_parameter,
                                                      extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    assert len(train) == len(sentences) * split_parameter[0]
    assert len(val) == len(sentences) * split_parameter[1]
    assert len(test) == len(sentences) * split_parameter[2]

    assert len(train_dicts) == len(dicts) * split_parameter[0]
    assert len(val_dicts) == len(dicts) * split_parameter[1]
    assert len(test_dicts) == len(dicts) * split_parameter[2]
class MojiModel(nn.Module):
    def __init__(self, use_cuda=True):
        super(MojiModel, self).__init__()
        self.use_cuda = use_cuda
        self.EMOJIS = EMOJIS
        self.emoji_model = torchmoji_emojis(PRETRAINED_PATH)
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        self.tokenizer = SentenceTokenizer(vocabulary, 100)
        print(self.emoji_model)
        self.feat_model = torchmoji_feature_encoding(PRETRAINED_PATH)
        if use_cuda:
            self.emoji_model = self.emoji_model.cuda()
            self.feat_model = self.feat_model.cuda()

    def predict(self, input_txt):
        input_txt = [input_txt]
        tokenized, _, _ = self.tokenizer.tokenize_sentences(input_txt)
        if self.use_cuda:
            tokenized = torch.cuda.LongTensor(tokenized.astype('int32'))
        prob = self.emoji_model(tokenized)[0]
        return prob

    def moji_feat(self, input_txt):
        input_txt = [input_txt]
        tokenized, _, _ = self.tokenizer.tokenize_sentences(input_txt)
        if self.use_cuda:
            tokenized = torch.cuda.LongTensor(tokenized.astype('int32'))
        return self.feat_model(tokenized)[0]

    def to_emoji(self, idx):
        return emoji.emojize(self.EMOJIS[idx], use_aliases=True)
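# A minimal usage sketch for the MojiModel wrapper above, assuming torchMoji and
# its pretrained weights are installed; the sentence and variable names are
# illustrative only, not part of the original snippet.
moji = MojiModel(use_cuda=False)              # pass True only if a GPU is set up
prob = moji.predict("I love mom's cooking")   # distribution over the 64 emoji classes
top_idx = int(np.argmax(prob))                # index of the most likely emoji
print(moji.to_emoji(top_idx))
feat = moji.moji_feat("I love mom's cooking") # torchMoji feature vector for the sentence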
async def predict_sentence_emojis(sentence: str, num_to_predict: int = 5) -> dict:
    """ Predict top n emojis based on the sentence
    :param sentence: sentence used in prediction
    :param num_to_predict: number of top emojis to return
    :return: Dictionary where key is predicted emoji and value is its probability
    """
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, MAXLEN)
    model = torchmoji_emojis(PRETRAINED_PATH)

    print('Running predictions.')
    tokenized, _, _ = st.tokenize_sentences([sentence])
    prob = model(tokenized)[0]
    ind_top = top_elements(prob, num_to_predict)
    emojis = list(map(lambda x: EMOJIS[x], ind_top))
    # Might be useful if we need to send it this way
    # emojis_unicode_escape = [unicode_codes.EMOJI_ALIAS_UNICODE[emoj].encode('unicode-escape')
    #                          for emoj in emojis]
    emojis_unicode = [unicode_codes.EMOJI_ALIAS_UNICODE[emoj] for emoj in emojis]
    return dict(zip(emojis_unicode, prob[ind_top]))
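# Illustrative call of the coroutine above, assuming MAXLEN, VOCAB_PATH,
# PRETRAINED_PATH, EMOJIS and top_elements are defined as in the original module.
import asyncio

emoji_probs = asyncio.run(
    predict_sentence_emojis("I love cruising with my homies", num_to_predict=3))
print(emoji_probs)  # {emoji unicode: probability, ...}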
def load_benchmark(path, vocab, extend_with=0):
    """ Loads the given benchmark dataset.

        Tokenizes the texts using the provided vocabulary, extending it with
        words from the training dataset if extend_with > 0. Splits them into
        three lists: training, validation and testing (in that order).

        Also calculates the maximum length of the texts and the
        suggested batch_size.

    # Arguments:
        path: Path to the dataset to be loaded.
        vocab: Vocabulary to be used for tokenizing texts.
        extend_with: If > 0, the vocabulary will be extended with up to
            extend_with tokens from the training set before tokenizing.

    # Returns:
        A dictionary with the following fields:
            texts: List of three lists, containing tokenized inputs for
                training, validation and testing (in that order).
            labels: List of three lists, containing labels for training,
                validation and testing (in that order).
            added: Number of tokens added to the vocabulary.
            batch_size: Batch size.
            maxlen: Maximum length of an input.
    """
    # Pre-processing dataset
    with open(path, 'rb') as dataset:
        if IS_PYTHON2:
            data = pickle.load(dataset)
        else:
            data = pickle.load(dataset, fix_imports=True)

    # Decode data
    try:
        texts = [unicode_(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    # Extract labels
    labels = [x['label'] for x in data['info']]

    batch_size, maxlen = calculate_batchsize_maxlen(texts)
    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(texts,
                                                   labels,
                                                   [data['train_ind'],
                                                    data['val_ind'],
                                                    data['test_ind']],
                                                   extend_with=extend_with)
    return {'texts': texts,
            'labels': labels,
            'added': added,
            'batch_size': batch_size,
            'maxlen': maxlen}
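# A hedged usage sketch for load_benchmark above; the dataset path is a
# placeholder and the unpacking simply mirrors the documented return value.
with open(VOCAB_PATH, 'r') as f:
    vocab = json.load(f)

benchmark = load_benchmark('data/SS-Twitter/raw.pickle', vocab, extend_with=10000)
train_texts, val_texts, test_texts = benchmark['texts']
train_labels, val_labels, test_labels = benchmark['labels']
print('Added {} tokens, batch_size={}, maxlen={}'.format(
    benchmark['added'], benchmark['batch_size'], benchmark['maxlen']))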
def get_emotion_features_from_text(text, audio_filename):
    # https://github.com/huggingface/torchMoji/blob/master/examples/score_texts_emojis.py
    if text == '':
        emoji_ids = []
        one_hot_encodings = []
    else:
        text = [text]

        def top_elements(array, k):
            ind = np.argpartition(array, -k)[-k:]
            return ind[np.argsort(array[ind])][::-1]

        maxlen = 30

        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)

        st = SentenceTokenizer(vocabulary, maxlen)
        model = torchmoji_emojis(PRETRAINED_PATH)

        tokenized, _, _ = st.tokenize_sentences(text)
        prob = model(tokenized)

        # Find top emojis for each sentence. Emoji ids (0-63)
        # correspond to the mapping in emoji_overview.png
        # at the root of the torchMoji repo.
        scores = []
        for i, t in enumerate(text):
            t_tokens = tokenized[i]
            t_score = [t]
            t_prob = prob[i]
            ind_top = top_elements(t_prob, 5)
            t_score.append(sum(t_prob[ind_top]))
            t_score.extend(ind_top)
            t_score.extend([t_prob[ind] for ind in ind_top])
            scores.append(t_score)

        emoji_ids = scores[0][2:2 + 5]
        one_hot_encodings = []
        for emoji_idx in emoji_ids:
            one_hot_encodings.append(
                [0 if i != emoji_idx else 1 for i in range(64)])

    a = audio_filename.split('/')
    filename = '/' + '/'.join(a[1:-1]) + '/onehot_emotion_' + a[-1].split('.wav')[0] + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(one_hot_encodings, f)

    filename = '/' + '/'.join(a[1:-1]) + '/emoji_ids_' + a[-1].split('.wav')[0] + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(emoji_ids, f)

    return emoji_ids, one_hot_encodings
def test_id_to_sentence():
    """Tokenizing and converting back preserves the input. """
    vb = {'CUSTOM_MASK': 0, 'aasdf': 1000, 'basdf': 2000}

    sentence = 'aasdf basdf basdf basdf'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == sentence
def test_id_to_sentence_with_unknown():
    """Tokenizing and converting back preserves the input, except for unknowns. """
    vb = {'CUSTOM_MASK': 0, 'CUSTOM_UNKNOWN': 1, 'aasdf': 1000, 'basdf': 2000}

    sentence = 'aasdf basdf ccc'
    expected = 'aasdf basdf CUSTOM_UNKNOWN'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == expected
def convert_dataset(filepath, extend_with, vocab):
    print('-- Generating {} '.format(filepath))
    sys.stdout.flush()
    st = SentenceTokenizer(vocab, maxlen)
    tokenized, dicts, _ = st.split_train_val_test(texts,
                                                  labels,
                                                  [data['train_ind'],
                                                   data['val_ind'],
                                                   data['test_ind']],
                                                  extend_with=extend_with)
    pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
                         dicts[0], dicts[1], dicts[2])
    with open(filepath, 'w') as f:
        pickle.dump(pick, f)
    cover = coverage(tokenized[2])

    print(' done. Coverage: {}'.format(cover))
def load_torchmoji(self):
    """ Use torchMoji to score texts for emoji distribution.

    The resulting emoji ids (0-63) correspond to the mapping
    in the emoji_overview.png file at the root of the torchMoji repo.

    Writes the result to a csv file.
    """
    import json
    import numpy as np
    import os
    from torchmoji.sentence_tokenizer import SentenceTokenizer
    from torchmoji.model_def import torchmoji_feature_encoding
    from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    maxlen = 130
    texts = [
        "Testing!",
    ]

    with torch.no_grad():
        # init model
        st = SentenceTokenizer(vocabulary, maxlen,
                               ignore_sentences_with_only_custom=True)
        torchmoji = torchmoji_feature_encoding(PRETRAINED_PATH)

    return st, torchmoji
def init_tokenizer_emotions(max_len):
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, max_len)
    model = torchmoji_emojis(PRETRAINED_PATH)
    return st, model
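# Possible usage of init_tokenizer_emotions above; the sentence is an example and
# top_elements is the ranking helper used elsewhere in these snippets.
st, model = init_tokenizer_emotions(max_len=30)
tokenized, _, _ = st.tokenize_sentences(['This is the shit'])
prob = model(tokenized)[0]
print(top_elements(prob, 5))  # ids of the five most likely emojis (0-63)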
class EmotionBiLSTM(Model):
    def __init__(self, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.accuracy = MicroMetrics(vocab)
        self.label_index_to_label = self.vocab.get_index_to_token_vocabulary('labels')
        final_concatenated_dimension = 64 * 3
        self.input_layer = torch.nn.Linear(
            in_features=final_concatenated_dimension, out_features=64)
        self.output_layer = torch.nn.Linear(
            in_features=64, out_features=vocab.get_vocab_size("labels"))
        self.sigmoid = nn.Sigmoid()
        with open(VOCAB_PATH, 'r') as f:
            self.vocabulary = json.load(f)
        self.st = SentenceTokenizer(self.vocabulary, 20)
        self.model = torchmoji_emojis(PRETRAINED_PATH)

    def tokenize(self, sentences):
        tokenized, _, _ = self.st.tokenize_sentences(sentences)
        return torch.from_numpy(tokenized.astype(np.int))

    def forward(self,
                turn1,
                turn2,
                turn3,
                conversation_id: str,
                turns: str,
                labels: torch.Tensor = None):
        # TODO: look up reverse embedding of padded sequences
        turn1 = [x['turn1'] for x in turn1]
        turn2 = [x['turn2'] for x in turn2]
        turn3 = [x['turn3'] for x in turn3]
        predictions1 = self.model(self.tokenize(turn1))
        predictions2 = self.model(self.tokenize(turn2))
        predictions3 = self.model(self.tokenize(turn3))
        predictions = torch.cat([predictions1, predictions2, predictions3], dim=1)
        input2hidden = self.input_layer(predictions)
        label_logits = self.sigmoid(self.output_layer(input2hidden))
        # self.matrix_attention = self.matrix_attention(encoded_turn1and2, encoded_turn3)
        label_logits = F.softmax(label_logits, dim=1)
        output = {
            "prediction": [
                self.label_index_to_label[x]
                for x in label_logits.argmax(dim=1).numpy()
            ],
            "ids": [x["ids"] for x in conversation_id],
            "turns": [x["turns"] for x in turns]
        }
        if labels is not None:
            # TODO: check loss with and without mask
            self.accuracy(label_logits, labels)
            output["loss"] = cross_entropy_loss(label_logits, labels)
        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}
def __init__(self, *args, **kwargs):
    HTTPServer.__init__(self, *args, **kwargs)
    with open(vocab_file_path, 'r') as f:
        vocabulary = json.load(f)
    max_sentence_length = 100
    self.st = SentenceTokenizer(vocabulary, max_sentence_length)
    self.model = torchmoji_emojis(model_weights_path)
def test_score_emoji():
    """ Emoji predictions make sense. """
    test_sentences = ['I love mom\'s cooking',
                      'I love how you never reply back..',
                      'I love cruising with my homies',
                      'I love messing with yo mind!!',
                      'I love you and now you\'re just gone..',
                      'This is shit',
                      'This is the shit']

    expected = [np.array([36, 4, 8, 16, 47]),
                np.array([1, 19, 55, 25, 46]),
                np.array([31, 6, 30, 15, 13]),
                np.array([54, 44, 9, 50, 49]),
                np.array([46, 5, 27, 35, 34]),
                np.array([55, 32, 27, 1, 37]),
                np.array([48, 11, 6, 31, 9])]

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    # Initialize by loading dictionary and tokenize texts
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, 30)
    tokens, _, _ = st.tokenize_sentences(test_sentences)

    # Load model and run
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    prob = model(tokens)

    # Find top emojis for each sentence
    for i, t_prob in enumerate(list(prob)):
        assert np.array_equal(top_elements(t_prob, 5), expected[i])
def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--text', type=str, required=True,
                           help="Input text to emojize")
    argparser.add_argument('--maxlen', type=int, default=30,
                           help="Max length of input text")
    args = argparser.parse_args()

    # Tokenizing using dictionary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, args.maxlen)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)

    # Running predictions
    tokenized, _, _ = st.tokenize_sentences([args.text])
    # Get sentence probability
    prob = model(tokenized)[0]
    # Top emoji ids
    emoji_ids = top_elements(prob, 5)
    # Map to emojis
    emojis = map(lambda x: EMOJIS[x], emoji_ids)

    print(emoji.emojize("{} {}".format(args.text, ' '.join(emojis)),
                        use_aliases=True))
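# Example invocation of the script above, assuming it is saved as text_emojize.py
# (the filename is an assumption, not part of the original snippet):
#   python text_emojize.py --text "I love mom's cooking" --maxlen 30
# It prints the input text followed by its five most likely emojis.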
def test_dataset_split_explicit():
    """ Dataset is split according to given indices """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(sentences, dicts,
                                                          split_parameter,
                                                          extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)
def text_to_emoji(text, maxlen):
    # Tokenizing using dictionary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, maxlen)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)

    # Running predictions
    tokenized, _, _ = st.tokenize_sentences([text])
    # Get sentence probability
    prob = model(tokenized)[0]
    # Top emoji ids
    emoji_ids = top_elements(prob, 5)
    # Map to emojis
    emojis = map(lambda x: EMOJIS[x], emoji_ids)

    print(emoji.emojize("{} {}".format(text, ' '.join(emojis)),
                        use_aliases=True))
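# Illustrative call of text_to_emoji above; the sentence is just an example.
text_to_emoji("I love how you never reply back..", maxlen=30)
# prints the sentence followed by its five most likely emojis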
class EmojiPredictor(object):
    def __init__(self):
        # Tokenizing using dictionary
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        self.st = SentenceTokenizer(vocabulary, 30)
        # Loading model
        self.model = torchmoji_emojis(PRETRAINED_PATH)
        # Running predictions
        self.dangoURL = "https://emoji.getdango.com/api/emoji?q="

    def getPredictedEmojis(self, text):
        api_response = ''
        try:
            # It turned out that Dango has stopped the API service,
            # so we might just use the deepmoji model instead.
            r = requests.get("https://emoji.getdango.com/api/emoji",
                             params={"q": text})
            api_response = json.loads(r.text)
        except Exception:
            pass
        if 'results' in api_response:
            res = [item['text'] for item in api_response['results']]
            if len(res) < 5:
                extraemojis = self.localPredict(text)
                for k in extraemojis:
                    if k not in res:
                        res.append(k)
                    if len(res) == 5:
                        return res
                # Fewer than 5 emojis collected overall; return what we have.
                return res
            else:
                return res[:5]
        else:
            return self.localPredict(text)

    def localPredict(self, text):
        tokenized, _, _ = self.st.tokenize_sentences([text.lower()])
        # Get sentence probability
        prob = self.model(tokenized)[0]
        # Top emoji ids
        emoji_ids = top_elements(prob, 6)
        # Drop emoji id 42 if present, keeping the prediction order.
        # (The original call discarded the result of np.setdiff1d, a no-op.)
        emoji_ids = emoji_ids[emoji_ids != 42]
        if len(emoji_ids) > 5:
            emoji_ids = emoji_ids[:5]
        # Map to emojis
        emojis = map(lambda x: EMOJIS[x], emoji_ids)
        return emoji.emojize(' '.join(emojis), use_aliases=True).split()
def init():
    global sentence_tokenizer
    global model
    global emoji_desc, emoji_unicode

    max_token = 30
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    sentence_tokenizer = SentenceTokenizer(vocabulary, max_token)
    model = torchmoji_emojis(PRETRAINED_PATH)

    with open('data/emoji_codes.json') as f:
        emoji_desc = json.load(f)
    with open('data/wanted_emojis.csv') as f:
        emoji_unicode = list(csv.reader(f))
class Emoji(runner.Runner):
    name = "emoji"

    def __init__(self, counter, name, max_concurrent_queries):
        super().__init__(counter, name, max_concurrent_queries)
        sys.path.append(os.path.join(self.data_dir, "tacotron2-PPP-1.3.0"))
        from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
        from torchmoji.model_def import torchmoji_emojis, torchmoji_feature_encoding
        from torchmoji.sentence_tokenizer import SentenceTokenizer

        self.log.debug("Loading model")
        with open(VOCAB_PATH, "r") as f:
            vocabulary = json.load(f)
        with torch.no_grad():
            self.tm_sentence_tokenizer = SentenceTokenizer(
                vocabulary, MAX_LEN, ignore_sentences_with_only_custom=True
            )
            self.tm_torchmoji = torchmoji_feature_encoding(PRETRAINED_PATH)
            self.tm_model = torchmoji_emojis(PRETRAINED_PATH)
        self.log.debug("Model loaded")

    async def func(self, request, **kwargs):
        text_batch = [self.normalize_input(request)]
        # remove quotes from text
        text_batch = [text.replace('"', "") for text in text_batch]
        tokenized, _, _ = self.tm_sentence_tokenizer.tokenize_sentences(text_batch)
        prob = self.tm_model(tokenized)[0]
        emoji_ids = top_elements(prob, 3)
        emojis = map(lambda x: EMOJIS[x], emoji_ids)
        emoji_score = [emoji.emojize(e, use_aliases=True) for e in emojis]
        return emoji_score
coverage_result = [p]
print('Calculating coverage for {}'.format(p))
with open(p, 'rb') as f:
    if IS_PYTHON2:
        s = pickle.load(f)
    else:
        s = pickle.load(f, fix_imports=True)

# Decode data
try:
    s['texts'] = [unicode(x) for x in s['texts']]
except UnicodeDecodeError:
    s['texts'] = [x.decode('utf-8') for x in s['texts']]

# Own
st = SentenceTokenizer({}, 30)
tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
                                          [s['train_ind'],
                                           s['val_ind'],
                                           s['test_ind']],
                                          extend_with=10000)
coverage_result.append(coverage(tests[2]))

# Last
st = SentenceTokenizer(vocab, 30)
tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
                                          [s['train_ind'],
                                           s['val_ind'],
                                           s['test_ind']],
                                          extend_with=0)
coverage_result.append(coverage(tests[2]))
DATASET = [
    'I am sentence 0',
    'I am sentence 1',
    'I am sentence 2',
    'I am sentence 3',
    'I am sentence 4',
    'I am sentence 5',
    'I am sentence 6',
    'I am sentence 7',
    'I am sentence 8',
    'I am sentence 9 newword',
]

INFO_DICTS = [
    {'label': 'sentence 0'},
    {'label': 'sentence 1'},
    {'label': 'sentence 2'},
    {'label': 'sentence 3'},
    {'label': 'sentence 4'},
    {'label': 'sentence 5'},
    {'label': 'sentence 6'},
    {'label': 'sentence 7'},
    {'label': 'sentence 8'},
    {'label': 'sentence 9'},
]

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

st = SentenceTokenizer(vocab, 30)

# Split using the default split ratio
print(st.split_train_val_test(DATASET, INFO_DICTS))

# Split explicitly
print(st.split_train_val_test(DATASET,
                              INFO_DICTS,
                              [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
                              extend_with=1))
TEST_SENTENCES = ['I love mom\'s cooking',
                  'I love how you never reply back..',
                  'I love cruising with my homies',
                  'I love messing with yo mind!!',
                  'I love you and now you\'re just gone..',
                  'This is shit',
                  'This is the shit']

maxlen = 30
batch_size = 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = torchmoji_feature_encoding(PRETRAINED_PATH)
print(model)

print('Encoding texts..')
encoding = model(tokenized)

print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
print(encoding[0, :5])

# Now you could visualize the encodings to see differences,
# run a logistic regression classifier on top,
# or basically anything you'd like to do.
else:
    raise Exception('elmo model not recognized')

elmo = Elmo(options_file, weight_file, 2, dropout=0).cuda()
elmo.eval()

EMOS = EMO_LIST
EMOS_DIC = dict(zip(EMOS, range(len(EMOS))))

tokenizer = GloveTokenizer()

# deepmoji
print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, PAD_LEN)

print('Loading model from {}.'.format(PRETRAINED_PATH))
emoji_model = torchmoji_feature_encoding(PRETRAINED_PATH)
emoji_model.eval()


class EmotionDataLoader(Dataset):
    def __init__(self, X, y, pad_len, max_size=None):
        self.source = []
        self.source_len = []
        self.target = y
        self.pad_len = pad_len
        self.read_data(X, y)

    def read_data(self, X, y):
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH, EMOJIS

import json
import numpy as np
import emoji


def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, 300)
model = torchmoji_emojis(PRETRAINED_PATH)


def emojify_sentences(l):
    tokenized, _, _ = st.tokenize_sentences(l)
    prob = model(tokenized)
    result = []
    for i in range(len(l)):
        t_prob = prob[i]
        ind_top = top_elements(t_prob, 5)
        result.append(
            list([
                emoji.emojize(EMOJIS[i], use_aliases=True),
GLOVE_EMB_PATH = opt.glovepath

options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

########## Set Assumptions ############
########## Set Assumptions ############

########## Get Elmo and Emoji Embedding ############
########## Get Elmo and Emoji Embedding ############
elmo = Elmo(options_file, weight_file, 2, dropout=0).cuda()
elmo.eval()

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
emoji_st = SentenceTokenizer(vocabulary, EMOJ_SENT_PAD_LEN)
########## Get Elmo and Emoji Embedding ############
########## Get Elmo and Emoji Embedding ############


def load_data_context(data_path='data/train.txt', is_train=True):
    data_list = []
    target_list = []
    f_data = open(data_path, 'r')
    data_lines = f_data.readlines()
    f_data.close()

    for i, text in enumerate(data_lines):
        # skip the first line
        if i == 0:
            continue
""" Take a given list of sentences and turn it into a numpy array, where each number corresponds to a word. Padding is used (number 0) to ensure fixed length of sentences. """ from __future__ import print_function, unicode_literals import example_helper import json from torchmoji.sentence_tokenizer import SentenceTokenizer with open('../model/vocabulary.json', 'r') as f: vocabulary = json.load(f) st = SentenceTokenizer(vocabulary, 30) test_sentences = [ '\u2014 -- \u203c !!\U0001F602', 'Hello world!', 'This is a sample tweet #example', ] tokens, infos, stats = st.tokenize_sentences(test_sentences) print(tokens) print(infos) print(stats)
        'status_id', 'sum(numScore)', 'text', 'created_at', 'name'
]]

# import and parse emoji codes
emoji_codes = pd.read_json(
    '/Users/ikennedy/Work/UW/Code/GIT/cl_lda/twitter/emojicodes.json',
    orient='values',
    typ='series').str.extract(':(\w+):', expand=False).sort_index()

# import vocab and model, define sentence tokenizer, set chunk_size
os.getcwd()
with open('/Users/ikennedy/Documents/GitHub/torchMoji/model/vocabulary.json') as f:
    vocab = json.load(f)
model = torchmoji_emojis('twitter/pytorch_model.bin')
st = SentenceTokenizer(vocab, 30)

# specify columns for full df for:
# twitter pull
df_full = pd.DataFrame(
    columns=['sentiment', 'id', 'date', 'query', 'screen_name', 'text'] +
    list(emoji_codes))
# Twitter sample
# df_full = pd.DataFrame(columns=['sentiment', 'text'] + list(emoji_codes))

# run in loops of 5000 to avoid overusing computational resources
chunk_size = 5000
i = 1000
chunk_size = 1000
for i in range(chunk_size, len(df) + chunk_size, chunk_size):
    if i > len(df):
        i = len(df)