def test_load_embd(self):
    sentences = [
        ['All', 'work', 'and', 'no', 'play'],
        ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
    ]
    dict_generator = get_dicts_generator(
        word_min_freq=1,
        char_min_freq=1,
        word_ignore_case=False,
        char_ignore_case=False,
    )
    for sentence in sentences:
        dict_generator(sentence)
    word_dict, char_dict, _ = dict_generator(return_dict=True)
    current = os.path.dirname(os.path.abspath(__file__))
    word_embd_file_path = os.path.join(current, 'demo_word_embd.txt')
    weights = get_embedding_weights_from_file(word_dict, word_embd_file_path, ignore_case=True)
    self.assertEqual((len(word_dict), 3), weights.shape)
    self.assertEqual([0.1, 0.2, 0.3], weights[word_dict['All']].tolist())
    self.assertEqual([0.4, 0.5, 0.6], weights[word_dict['work']].tolist())
    self.assertEqual([0.7, 0.8, 0.9], weights[word_dict['and']].tolist())
    char_embd_file_path = os.path.join(current, 'demo_char_embd.txt')
    weights = get_embedding_weights_from_file(char_dict, char_embd_file_path, ignore_case=True)
    self.assertEqual((len(char_dict), 3), weights.shape)
    self.assertEqual([0.1, 0.2, 0.3], weights[char_dict['A']].tolist())
    self.assertEqual([0.4, 0.5, 0.6], weights[char_dict['l']].tolist())
    self.assertEqual([0.7, 0.8, 0.9], weights[char_dict['w']].tolist())
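# A hypothetical helper sketching the GloVe-style text format that
# get_embedding_weights_from_file() parses: one token per line followed by its
# whitespace-separated float components. The rows below are inferred from the
# assertions above; the actual demo files may contain more entries.
def _write_demo_word_embd(self, path):
    rows = [
        ('All', [0.1, 0.2, 0.3]),
        ('work', [0.4, 0.5, 0.6]),
        ('and', [0.7, 0.8, 0.9]),
    ]
    with open(path, 'w') as writer:
        for token, vector in rows:
            # One line per token: "<token> <v1> <v2> <v3>"
            writer.write(token + ' ' + ' '.join(map(str, vector)) + '\n')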
def test_ignore_case(self):
    sentences = [
        ['All', 'work', 'and', 'no', 'play', ''],
        ['all', 'worK', 'and', 'no', 'play', '.'],
    ]
    dict_generator = get_dicts_generator(
        word_min_freq=2,
        char_min_freq=2,
        word_ignore_case=True,
        char_ignore_case=True,
    )
    for sentence in sentences:
        dict_generator(sentence)
    word_dict, char_dict, max_word_len = dict_generator(return_dict=True)
    self.assertEqual(4, max_word_len)
    self.assertEqual(7, len(word_dict))
    self.assertTrue('all' in word_dict)
    self.assertTrue('work' in word_dict)
    self.assertTrue('k' in char_dict)
def test_no_word(self):
    sentences = [
        ['All', 'work', 'and', 'no', 'play'],
        ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
    ]
    dict_generator = get_dicts_generator(
        word_min_freq=2,
        char_min_freq=2,
        word_ignore_case=False,
        char_ignore_case=False,
    )
    for sentence in sentences:
        dict_generator(sentence)
    word_dict, char_dict, max_word_len = dict_generator(return_dict=True)
    self.assertEqual(0, max_word_len)
    self.assertEqual(2, len(word_dict))
    self.assertTrue('u' not in char_dict)
    self.assertTrue('A' not in char_dict)
    self.assertTrue('n' in char_dict)
    self.assertTrue('a' in char_dict)
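# Why test_no_word expects len(word_dict) == 2: the generator reserves index 0
# for the padding token '' and index 1 for '<UNK>' (the same two reserved
# entries appear explicitly in the training code further below), and with
# word_min_freq=2 no case-sensitive word above occurs twice, so only the
# reserved entries survive. A minimal sketch of that behavior (the exact
# reserved mapping is an assumption inferred from the assertions):
def test_reserved_entries_sketch(self):
    dict_generator = get_dicts_generator(
        word_min_freq=2,
        char_min_freq=2,
        word_ignore_case=False,
        char_ignore_case=False,
    )
    dict_generator(['All', 'work'])  # each word seen only once
    word_dict, _, max_word_len = dict_generator(return_dict=True)
    self.assertEqual({'': 0, '<UNK>': 1}, word_dict)  # reserved entries only
    self.assertEqual(0, max_word_len)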
    targets_test)

# one-hot encoding for class labels
onehot_train, onehot_dev, onehot_test = (
    to_categorical(encoded_labels_train),
    to_categorical(encoded_labels_dev),
    to_categorical(encoded_labels_test),
)

print("max sequence length:", max(len(s) for s in sentences_train))
print("min sequence length:", min(len(s) for s in sentences_train))
s = sorted(len(s) for s in sentences_train)
print("median sequence length:", s[len(s) // 2])

del train

# --- Generate dictionaries for words and characters
dicts_generator = get_dicts_generator(
    word_min_freq=5,
    char_min_freq=2,
    word_ignore_case=True,
    char_ignore_case=False,
)
for sentence in sentences_train:
    dicts_generator(get_word_list_eng(sentence))
word_dict, char_dict, max_word_len = dicts_generator(return_dict=True)
# The dict objects here are word2index (or char2index) mappings: each gives
# the index of a word (or character) in the vocabulary.
print('Word dict size: %d Char dict size: %d Max word len: %d'
      % (len(word_dict), len(char_dict), max_word_len))

# --- Write word and char dicts to JSON files
with open(WORD_DICT, 'a') as output_wd:
    json.dump(word_dict, output_wd, ensure_ascii=False)
    output_wd.write('\n')
with open(CHAR_DICT, 'a') as output_cd:
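# Because the dicts are appended as one JSON object per line ('a' mode plus a
# trailing newline), a reader should parse the last non-empty line. A minimal
# sketch, assuming WORD_DICT/CHAR_DICT point at the files written above (the
# helper name is hypothetical):
import json

def load_latest_dict(path):
    """Return the most recently appended word2index (or char2index) mapping."""
    with open(path) as reader:
        lines = [line for line in reader if line.strip()]
    return json.loads(lines[-1])

# e.g. word_dict = load_latest_dict(WORD_DICT)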
def train(self, data=None, *args, **kwargs):
    """Train the model.

    After training, the weights are saved to the model.h5 file.

    :param data: unused here, since the training and validation files
        have already been supplied via the read_dataset() method
    :return: None
    """
    dicts_generator = get_dicts_generator(word_min_freq=2,
                                          char_min_freq=2,
                                          word_ignore_case=True,
                                          char_ignore_case=False)
    for sentence in self.train_sentences:
        dicts_generator(sentence)
    self.word_dict, self.char_dict, self.max_word_len = dicts_generator(
        return_dict=True)
    if os.path.exists(self.WORD_EMBD_PATH):
        print('Embedding...')
        # Rebuild the word dict from the embedding vocabulary, keeping the
        # reserved padding ('') and unknown ('<UNK>') entries at indices 0 and 1.
        self.word_dict = {
            '': 0,
            '<UNK>': 1,
        }
        with codecs.open(self.WORD_EMBD_PATH, 'r', 'utf8') as reader:
            print('Embedding open file')
            for line in reader:
                line = line.strip()
                if not line:
                    continue
                word = line.split()[0].lower()
                if word not in self.word_dict:
                    self.word_dict[word] = len(self.word_dict)
        print('Embedding for loop')
        self.word_embd_weights = get_embedding_weights_from_file(
            self.word_dict,
            self.WORD_EMBD_PATH,
            ignore_case=True,
        )
        print('Embedding done')
    else:
        self.word_embd_weights = None
        raise NameError('embedding file is not found')
    print('Embedding all done')
    train_steps = (len(self.train_sentences) + self.BATCH_SIZE - 1) // self.BATCH_SIZE
    valid_steps = (len(self.valid_sentences) + self.BATCH_SIZE - 1) // self.BATCH_SIZE
    self.model = build_model(rnn_num=self.RNN_NUM,
                             rnn_units=self.RNN_UNITS,
                             word_dict_len=len(self.word_dict),
                             char_dict_len=len(self.char_dict),
                             max_word_len=self.max_word_len,
                             output_dim=len(self.TAGS),
                             word_embd_weights=self.word_embd_weights)
    self.model.summary()
    if os.path.exists(self.MODEL_PATH):
        print("loading model from: ", self.MODEL_PATH)
        self.model.load_weights(self.MODEL_PATH, by_name=True)
    else:
        print('Fitting...')
        self.model.fit_generator(
            generator=self.batch_generator(self.train_sentences,
                                           self.train_taggings,
                                           train_steps),
            steps_per_epoch=train_steps,
            epochs=self.EPOCHS,
            validation_data=self.batch_generator(self.valid_sentences,
                                                 self.valid_taggings,
                                                 valid_steps),
            validation_steps=valid_steps,
            callbacks=[
                keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=2),
                keras.callbacks.EarlyStopping(
                    monitor='val_categorical_accuracy',
                    patience=2),
            ],
            verbose=True,
        )
        self.model.save_weights(self.MODEL_PATH)
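# self.batch_generator is not shown in this excerpt. A minimal sketch of what
# it could look like, assuming keras_wc_embd.get_batch_input builds the
# word/char index arrays and the targets are one-hot tag matrices padded to the
# batch sentence length (this helper is a sketch, not the repo's actual code):
def batch_generator(self, sentences, taggings, steps):
    import numpy
    from keras_wc_embd import get_batch_input
    while True:
        for i in range(steps):
            batch_sentences = sentences[i * self.BATCH_SIZE:(i + 1) * self.BATCH_SIZE]
            batch_taggings = taggings[i * self.BATCH_SIZE:(i + 1) * self.BATCH_SIZE]
            inputs = get_batch_input(
                batch_sentences,
                max_word_len=self.max_word_len,
                word_dict=self.word_dict,
                char_dict=self.char_dict,
                word_ignore_case=True,
                char_ignore_case=False,
            )
            sentence_len = inputs[0].shape[1]
            outputs = numpy.zeros((len(batch_taggings), sentence_len, len(self.TAGS)))
            for j, tags in enumerate(batch_taggings):
                for k, tag in enumerate(tags):
                    outputs[j, k, tag] = 1.0  # one-hot encode each tag index
            yield inputs, outputs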
    return Model(
        inputs=[premise_word_input_layer, hypothesis_word_input_layer],
        outputs=x)


training = get_data('../data/snli_1.0_train.jsonl')
validation = get_data('../data/snli_1.0_dev.jsonl')
test = get_data('../data/snli_1.0_test.jsonl')

sentences = (training[0] + training[1] + validation[0] + validation[1]
             + test[0] + test[1])

from keras_wc_embd import get_dicts_generator

dict_generator = get_dicts_generator(
    word_min_freq=1,
    char_min_freq=1,
    word_ignore_case=False,
    char_ignore_case=False,
)
for sentence in sentences:
    dict_generator(sentence)
word_dict, char_dict, _ = dict_generator(return_dict=True)


def get_input(sentences,
              word_unknown=1,
              char_unknown=1,
              word_ignore_case=False,
              char_ignore_case=False):
    sentence_num = len(sentences)
    max_sentence_len = params['max_length']
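# The body of get_input is truncated above. For reference, a generic sketch of
# the usual word-to-index padding step such a function performs (not
# necessarily this script's actual implementation):
import numpy

def words_to_padded_indices(sentences, word_dict, max_sentence_len,
                            word_unknown=1, word_ignore_case=False):
    # Map each word to its vocabulary index (unknown words get word_unknown)
    # and zero-pad every sentence to max_sentence_len.
    batch = numpy.zeros((len(sentences), max_sentence_len), dtype='int32')
    for i, sentence in enumerate(sentences):
        for j, word in enumerate(sentence[:max_sentence_len]):
            if word_ignore_case:
                word = word.lower()
            batch[i, j] = word_dict.get(word, word_unknown)
    return batch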
            if parts[0] != '-DOCSTART-':
                sentences[-1].append(parts[0])
                taggings[-1].append(TAGS[parts[-1]])
    if not sentences[-1]:
        sentences.pop()
        taggings.pop()
    return sentences, taggings


print('Loading...')
train_sentences, train_taggings = load_data(DATA_TRAIN_PATH)
valid_sentences, valid_taggings = load_data(DATA_VALID_PATH)

dicts_generator = get_dicts_generator(
    word_min_freq=2,
    char_min_freq=2,
    word_ignore_case=True,
    char_ignore_case=False
)
for sentence in train_sentences:
    dicts_generator(sentence)
word_dict, char_dict, max_word_len = dicts_generator(return_dict=True)

if os.path.exists(WORD_EMBD_PATH):
    print('Embedding...')
    word_dict = {
        '': 0,
        '<UNK>': 1,
    }
    with codecs.open(WORD_EMBD_PATH, 'r', 'utf8') as reader:
        for line in reader:
            line = line.strip()