예제 #1
0
 def test__map_pos(self):
     """_map_pos hands out sequential ids and stays stable on repeat lookups."""
     loader = MorphemesLoader()
     # First tag gets id 1; asking again must return the same id.
     self.assertEqual(loader._map_pos('P1'), 1)
     self.assertEqual(loader._map_pos('P1'), 1)
     # A new tag gets the next id without disturbing earlier mappings.
     self.assertEqual(loader._map_pos('P2'), 2)
     self.assertEqual(loader._map_pos('P1'), 1)
     self.assertEqual(loader._map_pos('P3'), 3)
예제 #2
0
 def test_load_data(self):
     """load_data (one-hot mode) yields padded (words x morphemes) matrices."""
     loader = MorphemesLoader()
     sentences = loader.load_data()['train']
     self.assertGreater(len(sentences), 0)
     first = sentences[0]
     # The first sentence has 5 words; every padded row after them is all zeros.
     self.assertEqual(first[5:, :].sum(), 0)
     # Shape is (max words per sentence, number of distinct morphemes).
     self.assertEqual(first.shape, (self.max_word_length, self.number_of_morphemes))
     # The opening word is a quote; its morpheme column (index 1) is hot.
     self.assertEqual(first[0, 1], 1)
예제 #3
0
 def test_load_data_power_set(self):
     """load_data (power-set mode) yields one integer key per word slot."""
     loader = MorphemesLoader()
     loader.use_power_set = True
     sentences = loader.load_data()['train']
     self.assertGreater(len(sentences), 0)
     first = sentences[0]
     # Only 5 real words; padding positions must stay 0.
     self.assertEqual(first[5:].sum(), 0)
     # One key per word, so the tensor is one-dimensional.
     self.assertEqual(first.shape, (self.max_word_length,))
     # First word is the quote; here it maps to unique power-set key 0.
     self.assertEqual(first[0], 0)
예제 #4
0
    def test_load_data_min_threshold(self):
        """POS classes below min_appearance_threshold are pruned from the columns."""
        loader = MorphemesLoader(min_appearance_threshold=1500)
        sentences = loader.load_data()['train']
        self.assertGreater(len(sentences), 0)
        first = sentences[0]
        # 5 words in the sentence; padded rows are all zeros.
        self.assertEqual(first[5:, :].sum(), 0)
        # Thresholding removes rare classes, so strictly fewer columns remain.
        self.assertLess(first.shape[1], self.number_of_morphemes)
        # First word is the quote, which almost never appears.
        self.assertEqual(first[0, 1], 1)
        # Second is COP, which appears less often than the threshold.
        self.assertEqual(first[1, 0], 1)

        # The tag itself is still registered in the mapping.
        self.assertTrue('yyQUOT' in loader.pos_mapping)
예제 #5
0
 def test__get_sentence_vector(self):
     """A 6-morpheme sentence becomes a (max_word_length, morphemes + 1) tensor."""
     loader = MorphemesLoader()
     conll_lines = """0	1	"	_	yyQUOT	yyQUOT	_	1
     1	2	תהיה	היה	COP	COP	gen=F|num=S|per=3	2
     2	3	נקמה	נקמה	NN	NN	gen=F|num=S	3
     3	4	ו	ו	CONJ	CONJ	_	4
     4	5	בגדול	בגדול	RB	RB	_	4
     5	6	.	_	yyDOT	yyDOT	_	5"""
     vector = loader._get_sentence_vector(conll_lines)
     # 6 morphemes in the sample sentence; width is that count plus one.
     morpheme_count = 6
     self.assertEqual(vector.shape, (self.max_word_length, morpheme_count + 1))
예제 #6
0
    def test__get_sentence_morpheme_map(self):
        morpheme_loader = MorphemesLoader()
        test_string = """0	1	"	_	yyQUOT	yyQUOT	_	1
1	2	תהיה	היה	COP	COP	gen=F|num=S|per=3	2
2	3	נקמה	נקמה	NN	NN	gen=F|num=S	3
3	4	ו	ו	CONJ	CONJ	_	4
4	5	בגדול	בגדול	RB	RB	_	4
5	6	.	_	yyDOT	yyDOT	_	5"""
        test_answer = morpheme_loader._get_sentence_morpheme_map(test_string)
        self.assertEqual(test_answer[0],(set([1])))
        self.assertEqual(test_answer[3],(set([4,5])))
예제 #7
0
 def test__get_sentence_vector_power_set(self):
     """In power-set mode the sentence vector holds one key per word slot."""
     loader = MorphemesLoader()
     loader.use_power_set = True
     conll_lines = """0	1	"	_	yyQUOT	yyQUOT	_	1
     1	2	תהיה	היה	COP	COP	gen=F|num=S|per=3	2
     2	3	נקמה	נקמה	NN	NN	gen=F|num=S	3
     3	4	ו	ו	CONJ	CONJ	_	4
     4	5	בגדול	בגדול	RB	RB	_	4
     5	6	.	_	yyDOT	yyDOT	_	5"""
     vector = loader._get_sentence_vector(conll_lines)
     # One integer key per word, so the result is one-dimensional.
     self.assertEqual(vector.shape, (self.max_word_length,))
예제 #8
0
 def test__set_to_vec(self):
     """_set_to_vec turns an id set into a multi-hot vector of length max_pos_id."""
     loader = MorphemesLoader()
     loader.max_pos_id = 2
     self.assertEqual(list(loader._set_to_vec({0, 1})), [1, 1])
     # NOTE(review): {} is an empty dict, not a set — passes as long as the
     # implementation only iterates the argument; confirm that is intended.
     self.assertEqual(list(loader._set_to_vec({})), [0, 0])
     self.assertEqual(list(loader._set_to_vec({0})), [1, 0])
     # The vector length must track max_pos_id when it grows.
     loader.max_pos_id = 3
     self.assertEqual(len(loader._set_to_vec({})), 3)
예제 #9
0
 def test__set_to_vec_power_set(self):
     """Power-set mode: every distinct set gets a stable integer key."""
     loader = MorphemesLoader()
     loader.use_power_set = True
     loader.max_morpheme_count = 2
     # Unseen sets receive consecutive keys in order of first appearance.
     self.assertEqual(loader._set_to_vec({0, 1}), [0])
     self.assertEqual(loader._set_to_vec({}), [1])
     self.assertEqual(loader._set_to_vec({0}), [2])
     self.assertEqual(loader.max_power_set_key, 3)
     # A previously seen set reuses its key and leaves the counter untouched.
     self.assertEqual(loader._set_to_vec({0, 1}), [0])
     self.assertEqual(loader.max_power_set_key, 3)
예제 #10
0
파일: train.py 프로젝트: idanbrus/ELMoOnMD
def train(tb_dir: str = 'default',
          positive_weight: float = 3,
          n_epochs: int = 3,
          use_power_set: bool = False,
          min_appearance_threshold: int = 0,
          combine_yy: bool = False,
          lr: float = 1e-4) -> Embedder:
    """
    Train a New ELMo On MD model. The model will be saved under directory "trained models"
    (NOTE(review): no save call is visible inside this function — confirm where saving happens.)
    Args:
        tb_dir: A directory for tensorboard files
        positive_weight: weight to give the positive samples in the labeled data
        n_epochs: number of epochs to train the network
        use_power_set: whether to use the power-set label encoding or not
        min_appearance_threshold: minimum POS tag class size
        combine_yy: whether to combine all the punctuation to one class or not
        lr: model's learning rate

    Returns:
        A newly trained ELMo embedder
        NOTE(review): the annotation says ``Embedder`` but the function actually
        returns the tuple ``(embedder, md_model)`` — confirm against callers.
    """
    # Use the GPU when available; models and batches are moved to this device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # create the pretrained elmo model
    embedder = get_pretrained_elmo()
    elmo_model = embedder.model

    # some training parameters
    max_sentence_length = MorphemesLoader().max_sentence_length

    # create input data
    # transform_input batches the raw tokens (batch size 32 train / 8 dev) and
    # returns word ids, char ids, lengths, masks, text and a recovery index.
    tokens = TokenLoader().load_data()
    train_w, train_c, train_lens, train_masks, train_text, recover_ind = transform_input(
        tokens['train'], embedder, 32)
    val_w, val_c, val_lens, val_masks, val_text, val_recover_ind = transform_input(
        tokens['dev'], embedder, 8)

    # create MD data
    md_loader = MorphemesLoader(
        use_power_set=use_power_set,
        min_appearance_threshold=min_appearance_threshold,
        combine_yy=combine_yy)
    md_data = md_loader.load_data()
    # Re-align the MD labels with the batching/order produced by transform_input.
    train_md_labels = split_data(md_data['train'],
                                 recover_ind,
                                 train_lens,
                                 use_power_set=use_power_set)
    val_md_labels = split_data(md_data['dev'],
                               val_recover_ind,
                               val_lens,
                               use_power_set=use_power_set)
    val_md_labels = torch.cat(val_md_labels)
    # Output width: one class per power-set key, or one logit per POS id.
    total_pos_num = md_loader.max_power_set_key if use_power_set else md_loader.max_pos_id

    # create the MD module
    md_model = BiLSTM(n_tags=total_pos_num, device=device, p_dropout=0.0)
    full_model = nn.Sequential(elmo_model, md_model).to(device)

    # create the tensorboard
    path = os.path.join('../../elmo_tb_runs/',
                        tb_dir)  # , str(datetime.datetime.now()))
    writer = SummaryWriter(path)
    global_step = 0

    # Power-set mode is single-label multi-class (cross entropy); otherwise each
    # POS is an independent binary decision, weighted toward positive samples.
    criterion = nn.CrossEntropyLoss() if use_power_set else \
        nn.BCEWithLogitsLoss(pos_weight=torch.ones(total_pos_num) * positive_weight)  # Binary cross entropy
    # Only the MD head is optimized at first; see the `epoch == 15` switch below.
    optimizer = Adam(md_model.parameters(), lr=lr)

    def validate():
        # Compute (precision, recall, f_score, support) on the dev set,
        # batch by batch, with gradients disabled.
        with torch.no_grad():
            y_pred = []
            for w, c, lens, masks, texts in zip(val_w, val_c, val_lens,
                                                val_masks, val_text):
                output = elmo_model.forward(w.to(device), c.to(device), [
                    masks[0].to(device), masks[1].to(device),
                    masks[2].to(device)
                ])
                output = md_model(output)

                # apply mask
                sentence_mask = masks[0].to(device)[:, :, None].float()
                output = output * sentence_mask

                # Pad each batch to max_sentence_length so that all batches can
                # be concatenated along the batch dimension below.
                target = torch.zeros(
                    (output.shape[0], max_sentence_length, total_pos_num))
                target[:, :output.shape[1], :] = output
                y_pred.append(target)
            y_pred = torch.cat(y_pred, dim=0)
            if use_power_set:
                # Single label per word: pick the argmax class.
                y_pred = nn.Softmax(dim=-1)(y_pred).argmax(dim=-1)
            else:
                # Independent binary decisions: threshold each probability at 0.5.
                y_pred = nn.Sigmoid()(y_pred) > 0.5
            precision, recall, f_score, support = precision_recall_fscore_support(
                val_md_labels.reshape(-1), y_pred.reshape(-1))
            # Index [1] selects the metrics of class 1 (the positive class).
            return precision[1], recall[1], f_score[1], support[1]

    for epoch in tqdm(range(n_epochs), desc='epochs', unit='epoch'):
        # mini batches
        for w, c, lens, masks, texts, labels in zip(train_w, train_c,
                                                    train_lens, train_masks,
                                                    train_text,
                                                    train_md_labels):
            optimizer.zero_grad()

            # forward
            w, c, masks = w.to(device), c.to(device), [
                masks[0].to(device), masks[1].to(device), masks[2].to(device)
            ]
            output = elmo_model.forward(w, c, masks)
            output = md_model(output)

            # apply mask
            sentence_mask = masks[0].to(device)[:, :, None].float()
            output = output * sentence_mask

            # pad with zeros to fit the labels
            full_output = torch.zeros(
                (output.shape[0], max_sentence_length, total_pos_num))
            full_output[:, :output.shape[1], :] = output

            # change format if using power set
            # (CrossEntropyLoss expects the class dimension second)
            full_output = full_output.transpose(
                -2, -1) if use_power_set else full_output

            loss = criterion(full_output, labels)
            # NOTE(review): retain_graph=True keeps the full graph alive on every
            # step, which costs memory — confirm it is actually required here.
            loss.backward(retain_graph=True)
            optimizer.step()

            writer.add_scalar('train_loss', loss, global_step=global_step)

            # validation set
            if global_step % 10 == 0:
                # NOTE(review): Tensor.to returns a new tensor; these three
                # calls discard their result and therefore have no effect.
                output.to('cpu')
                full_output.to('cpu')
                loss.to('cpu')
                precision, recall, f_score, _ = validate()
                writer.add_scalar('validation/Precision',
                                  precision,
                                  global_step=global_step)
                writer.add_scalar('validation/Recall',
                                  recall,
                                  global_step=global_step)
                writer.add_scalar('validation/F_score',
                                  f_score,
                                  global_step=global_step)

            global_step += 1

        # switch to train the ELMO too
        # NOTE(review): with the default n_epochs=3 this branch never runs;
        # n_epochs must exceed 15 for full-model fine-tuning to start.
        if epoch == 15:
            optimizer = Adam(full_model.parameters(), lr=lr)

    return embedder, md_model
예제 #11
0
 def test__get_pos_and_token_id(self):
     """Extracts the (POS tag, token id) pair from a tab-separated morpheme row."""
     loader = MorphemesLoader()
     cases = [
         ('3	4	ו	ו	CONJ	CONJ	_	4', ('CONJ', 4)),
         ('4	5	בגדול	בגדול	RB	RB	_	4', ('RB', 4)),
         ('1	2	תהיה	היה	COP	COP	gen=F|num=S|per=3	2', ('COP', 2)),
     ]
     for row, expected in cases:
         self.assertEqual(loader._get_pos_and_token_id(row), expected)
예제 #12
0
 def test_load_data_combine_yy(self):
     """With combine_yy, individual punctuation tags merge into one 'YY' class."""
     loader = MorphemesLoader(combine_yy=True)
     loader.load_data()
     # The specific punctuation tag must be gone from the mapping...
     self.assertTrue('yyQUOT' not in loader.pos_mapping)
     # ...replaced by the combined 'YY' class.
     self.assertTrue('YY' in loader.pos_mapping)