def test__map_pos(self):
    morpheme_loader = MorphemesLoader()
    self.assertEqual(morpheme_loader._map_pos('P1'), 1)
    self.assertEqual(morpheme_loader._map_pos('P1'), 1)
    self.assertEqual(morpheme_loader._map_pos('P2'), 2)
    self.assertEqual(morpheme_loader._map_pos('P1'), 1)
    self.assertEqual(morpheme_loader._map_pos('P3'), 3)
def test_load_data(self):
    morpheme_loader = MorphemesLoader()
    data = morpheme_loader.load_data()
    self.assertGreater(len(data['train']), 0)
    self.assertEqual(data['train'][0][5:, :].sum(), 0)  # 5 words, after them only 0s
    self.assertEqual(data['train'][0].shape,
                     (self.max_word_length, self.number_of_morphemes))  # 48 morphemes appear in train, max 80 words
    self.assertEqual(data['train'][0][0, 1], 1)  # first word is quotes
def test_load_data_power_set(self):
    morpheme_loader = MorphemesLoader()
    morpheme_loader.use_power_set = True
    data = morpheme_loader.load_data()
    self.assertGreater(len(data['train']), 0)
    self.assertEqual(data['train'][0][5:].sum(), 0)  # 5 words, after them only 0s
    self.assertEqual(data['train'][0].shape, (self.max_word_length,))  # 49 morphemes, max 80 words
    self.assertEqual(data['train'][0][0], 0)  # first word is quotes, this time it's a unique key
def test_load_data_min_threshold(self):
    morpheme_loader = MorphemesLoader(min_appearance_threshold=1500)
    data = morpheme_loader.load_data()
    self.assertGreater(len(data['train']), 0)
    self.assertEqual(data['train'][0][5:, :].sum(), 0)  # 5 words, after them only 0s
    self.assertLess(data['train'][0].shape[1],
                    self.number_of_morphemes)  # fewer classes remain after threshold filtering
    self.assertEqual(data['train'][0][0, 1], 1)  # first word is quotes, but it almost never appears
    self.assertEqual(data['train'][0][1, 0], 1)  # second is COP, but it appears less than the threshold
    self.assertTrue('yyQUOT' in morpheme_loader.pos_mapping)
def test__get_sentence_vector(self):
    morpheme_loader = MorphemesLoader()
    test_string = """0 1 " _ yyQUOT yyQUOT _ 1
1 2 תהיה היה COP COP gen=F|num=S|per=3 2
2 3 נקמה נקמה NN NN gen=F|num=S 3
3 4 ו ו CONJ CONJ _ 4
4 5 בגדול בגדול RB RB _ 4
5 6 . _ yyDOT yyDOT _ 5"""
    test_tensor = morpheme_loader._get_sentence_vector(test_string)
    number_of_morphemes_in_sentence = 6
    self.assertEqual(test_tensor.shape,
                     (self.max_word_length, number_of_morphemes_in_sentence + 1))
def test__get_sentence_morpheme_map(self):
    morpheme_loader = MorphemesLoader()
    test_string = """0 1 " _ yyQUOT yyQUOT _ 1
1 2 תהיה היה COP COP gen=F|num=S|per=3 2
2 3 נקמה נקמה NN NN gen=F|num=S 3
3 4 ו ו CONJ CONJ _ 4
4 5 בגדול בגדול RB RB _ 4
5 6 . _ yyDOT yyDOT _ 5"""
    test_answer = morpheme_loader._get_sentence_morpheme_map(test_string)
    self.assertEqual(test_answer[0], set([1]))
    self.assertEqual(test_answer[3], set([4, 5]))
def test__get_sentence_vector_power_set(self):
    morpheme_loader = MorphemesLoader()
    morpheme_loader.use_power_set = True
    test_string = """0 1 " _ yyQUOT yyQUOT _ 1
1 2 תהיה היה COP COP gen=F|num=S|per=3 2
2 3 נקמה נקמה NN NN gen=F|num=S 3
3 4 ו ו CONJ CONJ _ 4
4 5 בגדול בגדול RB RB _ 4
5 6 . _ yyDOT yyDOT _ 5"""
    test_tensor = morpheme_loader._get_sentence_vector(test_string)
    self.assertEqual(test_tensor.shape, (self.max_word_length,))
def test__set_to_vec(self):
    morpheme_loader = MorphemesLoader()
    morpheme_loader.max_pos_id = 2
    self.assertEqual(list(morpheme_loader._set_to_vec({0, 1})), [1, 1])
    self.assertEqual(list(morpheme_loader._set_to_vec(set())), [0, 0])
    self.assertEqual(list(morpheme_loader._set_to_vec({0})), [1, 0])
    morpheme_loader.max_pos_id = 3
    self.assertEqual(len(morpheme_loader._set_to_vec(set())), 3)
def test__set_to_vec_power_set(self):
    morpheme_loader = MorphemesLoader()
    morpheme_loader.use_power_set = True
    morpheme_loader.max_morpheme_count = 2
    self.assertEqual(morpheme_loader._set_to_vec({0, 1}), [0])
    self.assertEqual(morpheme_loader._set_to_vec(set()), [1])
    self.assertEqual(morpheme_loader._set_to_vec({0}), [2])
    self.assertEqual(morpheme_loader.max_power_set_key, 3)
    # an already-seen set maps to the same key and does not grow the key space
    self.assertEqual(morpheme_loader._set_to_vec({0, 1}), [0])
    self.assertEqual(morpheme_loader.max_power_set_key, 3)
def train(tb_dir: str = 'default',
          positive_weight: float = 3,
          n_epochs: int = 3,
          use_power_set: bool = False,
          min_appearance_threshold: int = 0,
          combine_yy: bool = False,
          lr: float = 1e-4) -> Embedder:
    """
    Train a new ELMo-on-MD model. The model will be saved under the directory "trained models".

    Args:
        tb_dir: a directory for the TensorBoard files
        positive_weight: weight to give the positive samples in the labeled data
        n_epochs: number of epochs to train the network
        use_power_set: whether to use the power-set representation or not
        min_appearance_threshold: minimum POS tag class size
        combine_yy: whether to combine all the punctuation tags into one class or not
        lr: the model's learning rate

    Returns:
        The newly trained ELMo embedder together with the trained MD module
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # create the pretrained ELMo model
    embedder = get_pretrained_elmo()
    elmo_model = embedder.model

    # some training parameters
    max_sentence_length = MorphemesLoader().max_sentence_length

    # create input data
    tokens = TokenLoader().load_data()
    train_w, train_c, train_lens, train_masks, train_text, recover_ind = transform_input(
        tokens['train'], embedder, 32)
    val_w, val_c, val_lens, val_masks, val_text, val_recover_ind = transform_input(
        tokens['dev'], embedder, 8)

    # create MD data
    md_loader = MorphemesLoader(use_power_set=use_power_set,
                                min_appearance_threshold=min_appearance_threshold,
                                combine_yy=combine_yy)
    md_data = md_loader.load_data()
    train_md_labels = split_data(md_data['train'], recover_ind, train_lens, use_power_set=use_power_set)
    val_md_labels = split_data(md_data['dev'], val_recover_ind, val_lens, use_power_set=use_power_set)
    val_md_labels = torch.cat(val_md_labels)
    total_pos_num = md_loader.max_power_set_key if use_power_set else md_loader.max_pos_id

    # create the MD module
    md_model = BiLSTM(n_tags=total_pos_num, device=device, p_dropout=0.0)
    full_model = nn.Sequential(elmo_model, md_model).to(device)

    # create the tensorboard writer
    path = os.path.join('../../elmo_tb_runs/', tb_dir)  # , str(datetime.datetime.now()))
    writer = SummaryWriter(path)
    global_step = 0

    criterion = nn.CrossEntropyLoss() if use_power_set else \
        nn.BCEWithLogitsLoss(pos_weight=torch.ones(total_pos_num) * positive_weight)  # binary cross entropy
    optimizer = Adam(md_model.parameters(), lr=lr)

    def validate():
        with torch.no_grad():
            y_pred = []
            for w, c, lens, masks, texts in zip(val_w, val_c, val_lens, val_masks, val_text):
                output = elmo_model.forward(w.to(device), c.to(device), [
                    masks[0].to(device), masks[1].to(device), masks[2].to(device)
                ])
                output = md_model(output)

                # apply mask
                sentence_mask = masks[0].to(device)[:, :, None].float()
                output = output * sentence_mask

                target = torch.zeros((output.shape[0], max_sentence_length, total_pos_num))
                target[:, :output.shape[1], :] = output
                y_pred.append(target)

            y_pred = torch.cat(y_pred, dim=0)
            if use_power_set:
                y_pred = nn.Softmax(dim=-1)(y_pred).argmax(dim=-1)
            else:
                y_pred = nn.Sigmoid()(y_pred) > 0.5

            precision, recall, f_score, support = precision_recall_fscore_support(
                val_md_labels.reshape(-1), y_pred.reshape(-1))
            return precision[1], recall[1], f_score[1], support[1]

    for epoch in tqdm(range(n_epochs), desc='epochs', unit='epoch'):
        # mini batches
        for w, c, lens, masks, texts, labels in zip(train_w, train_c, train_lens,
                                                    train_masks, train_text, train_md_labels):
            optimizer.zero_grad()

            # forward
            w, c, masks = w.to(device), c.to(device), [
                masks[0].to(device), masks[1].to(device), masks[2].to(device)
            ]
            output = elmo_model.forward(w, c, masks)
            output = md_model(output)

            # apply mask
            sentence_mask = masks[0].to(device)[:, :, None].float()
            output = output * sentence_mask

            # pad with zeros to fit the labels
            full_output = torch.zeros((output.shape[0], max_sentence_length, total_pos_num))
            full_output[:, :output.shape[1], :] = output

            # change format if using the power set
            full_output = full_output.transpose(-2, -1) if use_power_set else full_output

            loss = criterion(full_output, labels)
            loss.backward(retain_graph=True)
            optimizer.step()
            writer.add_scalar('train_loss', loss, global_step=global_step)

            # validation set
            if global_step % 10 == 0:
                output.to('cpu')
                full_output.to('cpu')
                loss.to('cpu')
                precision, recall, f_score, _ = validate()
                writer.add_scalar('validation/Precision', precision, global_step=global_step)
                writer.add_scalar('validation/Recall', recall, global_step=global_step)
                writer.add_scalar('validation/F_score', f_score, global_step=global_step)
            global_step += 1

        # switch to training the ELMo too
        if epoch == 15:
            optimizer = Adam(full_model.parameters(), lr=lr)

    return embedder, md_model
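# --- Usage sketch (not part of the original module) ---
# A minimal, illustrative way to invoke the training entry point above. The
# TensorBoard directory name and the hyper-parameter values below are
# assumptions chosen for the example, not values taken from the project.
if __name__ == '__main__':
    trained_embedder, trained_md_model = train(tb_dir='md_power_set_run',
                                               positive_weight=3,
                                               n_epochs=3,
                                               use_power_set=True,
                                               lr=1e-4)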
def test__get_pos_and_token_id(self):
    morpheme_loader = MorphemesLoader()
    self.assertEqual(morpheme_loader._get_pos_and_token_id('3 4 ו ו CONJ CONJ _ 4'), ('CONJ', 4))
    self.assertEqual(morpheme_loader._get_pos_and_token_id('4 5 בגדול בגדול RB RB _ 4'), ('RB', 4))
    self.assertEqual(morpheme_loader._get_pos_and_token_id('1 2 תהיה היה COP COP gen=F|num=S|per=3 2'), ('COP', 2))
def test_load_data_combine_yy(self):
    morpheme_loader = MorphemesLoader(combine_yy=True)
    data = morpheme_loader.load_data()
    self.assertTrue('yyQUOT' not in morpheme_loader.pos_mapping)
    self.assertTrue('YY' in morpheme_loader.pos_mapping)