def test_torchmoji_return_attention():
    seq_tensor = np.array([[1]])
    # test the output of the normal model
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    # check correct number of outputs
    assert len(model(seq_tensor)) == 1
    # repeat the tests above when returning attention weights
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH, return_attention=True)
    assert len(model(seq_tensor)) == 2
def __init__(self):
    """Constructor: download weights if needed, then load the model,
    tokenizer, and emoji/sentiment mappings."""
    # Automatically download weights
    if not os.path.isfile(PRETRAINED_PATH):
        os.system("(cd torchMoji && python scripts/download_weights_yes.py)")
    # Instantiate a PyTorch model
    self._model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    # Load vocabulary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    # Create tokenizer to split a sentence into words
    self._st = SentenceTokenizer(vocabulary, self._max_message_len_words)
    # Load a mapping from neural network predictions to smileys
    emoji_codes_path = os.path.join(ROOT_PATH, "data", "emoji_codes.json")
    with open(emoji_codes_path, 'r') as f:
        self._emoji_codes = json.load(f)
    # This reduces the 64 smileys to their "happiness" boolean flag
    with open("sentiment.json", 'r') as f:
        self._sentiments = json.load(f)
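# A minimal usage sketch (not part of the original code): how the objects
# loaded above might be combined to score a message. The helper name
# predict_happiness is hypothetical, and the key formats of emoji_codes.json
# and sentiment.json are assumptions (string emoji indices mapping to an
# emoji code and a happiness bool, respectively). Assumes the same
# module-level imports as the surrounding snippets (numpy as np, etc.).
def predict_happiness(self, message):
    """Hypothetical helper: map a message to (happiness flag, emoji code)."""
    # Tokenize the message with the same SentenceTokenizer used elsewhere
    tokens, _, _ = self._st.tokenize_sentences([message])
    # The model returns one probability per emoji class for each sentence
    prob = self._model(tokens)[0]
    # Index of the most likely emoji
    top_emoji = int(np.argmax(prob))
    # Assumed lookups: emoji code for display, happiness bool for sentiment
    emoji_code = self._emoji_codes[str(top_emoji)]
    is_happy = self._sentiments.get(str(top_emoji), False)
    return is_happy, emoji_code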
def __init__(self):
    # Load vocabulary and build the sentence tokenizer
    with open(vocab_file_path, 'r') as f:
        vocabulary = json.load(f)
    max_sentence_length = 100
    self.st = SentenceTokenizer(vocabulary, max_sentence_length)
    # Load the pretrained emoji model
    self.model = torchmoji_emojis(model_weights_path)
def __init__(self, max_sentence_length=30):
    # Tokenizing using the dictionary
    with open(VOCAB_PATH, 'r') as f:
        self.vocabulary = json.load(f)
    self.st = SentenceTokenizer(self.vocabulary, max_sentence_length)
    # Loading the model
    self.model = torchmoji_emojis(PRETRAINED_PATH)
def test_score_emoji():
    """ Emoji predictions make sense. """
    test_sentences = [
        'I love mom\'s cooking',
        'I love how you never reply back..',
        'I love cruising with my homies',
        'I love messing with yo mind!!',
        'I love you and now you\'re just gone..',
        'This is shit',
        'This is the shit'
    ]
    expected = [
        np.array([36, 4, 8, 16, 47]),
        np.array([1, 19, 55, 25, 46]),
        np.array([31, 6, 30, 15, 13]),
        np.array([54, 44, 9, 50, 49]),
        np.array([46, 5, 27, 35, 34]),
        np.array([55, 32, 27, 1, 37]),
        np.array([48, 11, 6, 31, 9])
    ]

    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    # Initialize by loading dictionary and tokenize texts
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, 30)
    tokens, _, _ = st.tokenize_sentences(test_sentences)

    # Load model and run
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    prob = model(tokens)

    # Find top emojis for each sentence
    for i, t_prob in enumerate(list(prob)):
        assert np.array_equal(top_elements(t_prob, 5), expected[i])
args = argparser.parse_args()

sentence_probs = []
retokenized_sentences = []
output_path = os.path.join(os.path.dirname(args.filepath), 'sentence_emojis.pkl')
retokenized_sentences_output_path = os.path.join(
    os.path.dirname(args.filepath), 'retokenized_sentences.pkl')

# Tokenizing using dictionary
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, args.maxlen)

# Loading model
model = torchmoji_emojis(PRETRAINED_PATH)
sentences = load_pickle(args.filepath)

# TODO: encode multiple sentences at once.
# Needs TorchMoji module to handle empty sentences and output equal probabilities
# flattened_sentences = [utterance for conversation in sentences for utterance in conversation]
# print('Encoding sentences ...')
# flattened_tokenized, _, _ = st.tokenize_sentences(flattened_sentences)
# flattened_probs = model(flattened_tokenized)
# print('TorchMoji encoding done.')

idx = 0
for conversation in sentences:
    idx += 1
    conversation_probs = []
    conversation_retokenized = []
    for sentence in conversation: