"""
Extend the given vocabulary using dataset-specific words.

1. First create a vocabulary for the specific dataset.
2. Find all words not in our vocabulary, but in the dataset vocabulary.
3. Take the top X (default=1000) of these words and add them to the vocabulary.
4. Save this combined vocabulary and embedding matrix, which can now be used.
"""
from __future__ import print_function

import json

from deepmoji.create_vocab import extend_vocab, VocabBuilder
from deepmoji.word_generator import WordGenerator

# Build a vocabulary for the (toy) dataset: 'newword' occurs twice,
# '#zzzzaaazzz' once.
new_words = [u'#zzzzaaazzz', u'newword', u'newword']
word_gen = WordGenerator(new_words)
vb = VocabBuilder(word_gen)
vb.count_all_words()

# Load the existing vocabulary shipped with the model
with open('../model/vocabulary.json') as f:
    vocab = json.load(f)

print(len(vocab))
print(vb.word_counts)

# Add at most one new token to the existing vocabulary
extend_vocab(vocab, vb, max_tokens=1)

# 'newword' should be added because it is the most frequent word that is
# not already in the vocabulary
print(vocab[u'newword'])
print(len(vocab))
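# ---------------------------------------------------------------------------
# Added sketch (not part of the original example): step 4 above mentions
# saving the combined vocabulary and embedding matrix, which this script does
# not show. The output paths, the embedding file, and the random
# initialization of new rows below are illustrative assumptions only.
# ---------------------------------------------------------------------------
import numpy as np

# Persist the extended word -> index mapping.
with open('../model/vocabulary_extended.json', 'w') as f:
    json.dump(vocab, f)

# Any embedding matrix tied to the old vocabulary needs one extra row per
# added token so that the new indices stay valid.
emb = np.load('../model/embedding_matrix.npy')  # placeholder path
n_new = len(vocab) - emb.shape[0]
if n_new > 0:
    extra = np.random.uniform(-0.1, 0.1, size=(n_new, emb.shape[1]))
    emb = np.vstack([emb, extra])
np.save('../model/embedding_matrix_extended.npy', emb)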
def split_train_val_test(self, sentences, info_dicts,
                         split_parameter=[0.7, 0.1, 0.2], extend_with=0):
    """ Splits given sentences into three different datasets: training,
        validation and testing.

    # Arguments:
        sentences: The sentences to be tokenized.
        info_dicts: A list of dicts that contain information about each
            sentence (e.g. a label).
        split_parameter: A parameter for deciding the splits between the
            three different datasets. If three values are passed, they are
            interpreted as split fractions. If three lists are passed
            instead, they specify which observations belong to which
            dataset.
        extend_with: An optional parameter. If > 0, this is the number of
            tokens added to the vocabulary from this dataset. The expanded
            vocab will be generated using only the training set, but is
            applied to all three sets.

    # Returns:
        List of three lists of tokenized sentences,
        List of three corresponding dictionaries with information,
        How many tokens have been added to the vocab. Make sure to extend
        the embedding layer of the model accordingly.
    """

    # If passed three lists of indices, use those directly
    if isinstance(split_parameter, list) and \
            all(isinstance(x, list) for x in split_parameter) and \
            len(split_parameter) == 3:

        # Helper function to verify that provided indices are numbers
        # within range of the dataset
        def verify_indices(inds):
            return list(filter(lambda i: isinstance(i, numbers.Number) and
                               i < len(sentences), inds))

        ind_train = verify_indices(split_parameter[0])
        ind_val = verify_indices(split_parameter[1])
        ind_test = verify_indices(split_parameter[2])
    else:
        # Split sentences and dicts randomly according to the given fractions
        ind = list(range(len(sentences)))
        ind_train, ind_test = train_test_split(
            ind, test_size=split_parameter[2])
        ind_train, ind_val = train_test_split(
            ind_train, test_size=split_parameter[1])

    # Map indices to data
    train = np.array([sentences[x] for x in ind_train])
    test = np.array([sentences[x] for x in ind_test])
    val = np.array([sentences[x] for x in ind_val])

    info_train = np.array([info_dicts[x] for x in ind_train])
    info_test = np.array([info_dicts[x] for x in ind_test])
    info_val = np.array([info_dicts[x] for x in ind_val])

    added = 0

    # Extend vocabulary with tokens from the training set only
    if extend_with > 0:
        wg = WordGenerator(train)
        vb = VocabBuilder(wg)
        vb.count_all_words()
        added = extend_vocab(self.vocabulary, vb, max_tokens=extend_with)

    # Tokenize each split with the (possibly extended) vocabulary
    result = [self.tokenize_sentences(s)[0] for s in [train, val, test]]
    result_infos = [info_train, info_val, info_test]

    return result, result_infos, added
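# ---------------------------------------------------------------------------
# Added usage sketch (not part of the original module). The method above is a
# SentenceTokenizer method and relies on module-level imports not shown in
# this excerpt (numbers, numpy as np, sklearn's train_test_split, and
# deepmoji's WordGenerator/VocabBuilder/extend_vocab). The vocabulary path,
# the fixed length of 30, and the toy sentences/labels below are assumptions
# made purely for illustration.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import json
    from deepmoji.sentence_tokenizer import SentenceTokenizer

    with open('../model/vocabulary.json') as f:
        vocab = json.load(f)

    sentences = [u'i love this so much', u'this is terrible', u'not sure yet',
                 u'best day ever', u'worst day ever', u'could be better',
                 u'absolutely wonderful', u'what a mess', u'pretty decent',
                 u'never again']
    labels = [{'label': y} for y in [1, 0, 0, 1, 0, 0, 1, 0, 1, 0]]

    st = SentenceTokenizer(vocab, 30)
    (train, val, test), (info_train, info_val, info_test), added = \
        st.split_train_val_test(sentences, labels,
                                split_parameter=[0.7, 0.1, 0.2],
                                extend_with=1000)

    # If `added` > 0, the model's embedding layer must be extended by
    # that many rows before training.
    print(added, len(train), len(val), len(test))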