Example #1
def train_and_val():
    embedding_dim = 100
    hidden_dim = 100
    model_load_path = None
    best_model_save_path = 'model/model_100_best_0223.pth'
    max_score = 0
    stop_epoch = 30
    unimprove_time = 0
    val_json_path = '/home/agwave/Data/resume/val_0222.json'
    val_pdf_dir = '/home/agwave/Data/resume/val_0222/'

    training_data = get_data_from_data_txt(TRAIN_WORD_TO_TAG_PATH)
    with open('supporting_document/train_word_to_tag_0223.json', 'r') as j:
        word_to_ix = json.load(j)
    tag_to_ix = {'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4, 'i-gend': 5,
                 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9, 'b-nati': 10, 'i-nati': 11,
                 'b-live': 12, 'i-live': 13, 'b-poli': 14, 'i-poli': 15, 'b-unv': 16, 'i-unv': 17,
                 'b-comp': 18, 'i-comp': 19, 'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23,
                 'b-proj': 24, 'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
                 'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34, 'i-prti': 35,
                 'o': 36, '<start>': 37, '<stop>': 38}
    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    start_epoch = 0
    if model_load_path is not None:
        print('load model...')
        checkpoint = torch.load(model_load_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
    preliminary_score = get_score_by_model(model, val_json_path, val_pdf_dir)
    print('preliminary score:', preliminary_score)

    for epoch in range(start_epoch, stop_epoch):
        print("---------------------")
        print("running epoch : ", epoch)
        start_time = time.time()
        for sentence, tags in tqdm(training_data):
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
        cur_epoch_score = get_score_by_model(model, val_json_path, val_pdf_dir)
        print('score', cur_epoch_score)
        print('running time:', time.time() - start_time)
        if cur_epoch_score > max_score:
            unimprove_time = 0
            max_score = cur_epoch_score
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch
            }, best_model_save_path)
            print('saved best model successfully.')
        else:
            # stop training as soon as the validation score fails to improve
            break
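Example #1 writes the best checkpoint as a dict with 'model_state_dict', 'optimizer_state_dict' and 'epoch' keys. Below is a minimal sketch of reloading such a checkpoint for scoring, reusing the constructor arguments from the example; load_best_model is a hypothetical helper name, not part of the original code.

def load_best_model(path, word_to_ix, tag_to_ix, embedding_dim=100, hidden_dim=100):
    # rebuild the network with the same sizes used during training (hypothetical helper)
    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()  # switch to evaluation mode before scoring
    return model, checkpoint['epoch']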
Example #2
def train_all_data():
    embedding_dim = 100
    hidden_dim = 100
    stop_epoch = 1
    model_1_epoch = 'model/model_1_epoch_lr0001.pth'

    training_data = get_data_from_data_txt(DATA_PERFECT_PATH)
    word_to_ix = get_word_to_ix(training_data, min_word_freq=1)
    tag_to_ix = {'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4, 'i-gend': 5,
                 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9, 'b-nati': 10, 'i-nati': 11,
                 'b-live': 12, 'i-live': 13, 'b-poli': 14, 'i-poli': 15, 'b-unv': 16, 'i-unv': 17,
                 'b-comp': 18, 'i-comp': 19, 'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23,
                 'b-proj': 24, 'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
                 'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34, 'i-prti': 35,
                 'o': 36, '<start>': 37, '<stop>': 38, 'c-live': 39, 'c-proj': 40, 'c-woti': 41,
                 'c-post': 42, 'c-unv': 43, 'c-nati': 44, 'c-poli': 45, 'c-prti':46, 'c-comp': 47}

    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Make sure prepare_sequence from earlier in the LSTM section is loaded
    for epoch in range(stop_epoch):  # stop_epoch is 1 here; increase it for a longer run
        print("---------------------")
        print("running epoch : ", epoch + 1)
        start_time = time.time()
        for sentence, tags in tqdm(training_data):
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 15)
            optimizer.step()
        cur_epoch_score = get_score_by_model(model, TRAIN_JSON_PATH, TRAIN_PDF_DIR)
        print('score', cur_epoch_score)
        print('running time:', time.time() - start_time)
        print()
        if epoch == stop_epoch - 1:  # save after the final epoch
            torch.save({
                'model_state_dict': model.state_dict()
            }, model_1_epoch)
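Both examples above call prepare_sequence, which is not shown here; the comment in Example #2 points at the helper from the PyTorch sequence-model tutorial. A minimal sketch of that helper, assuming every token is already present in word_to_ix (out-of-vocabulary tokens would need an extra check):

def prepare_sequence(seq, to_ix):
    # map each token to its index and wrap the result as a LongTensor
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)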
Example #3
import torch.optim as optim
from dataset import Dataset
from model import BiLSTM_CRF

# torch.set_default_tensor_type('torch.cuda.FloatTensor')

epochs = 100
dataset = Dataset()
train_loader = dataset.get_train_loader(1)
model = BiLSTM_CRF(dataset.get_vocab_size(), dataset.get_label_index_dict(),
                   128, 128)

optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

model.train()
for epoch in range(epochs):
    for step, batch in enumerate(train_loader):
        sentence_in, targets = batch.line, batch.label

        # batches come as (seq_len, batch); with batch size 1, flatten to a single 1-D sequence
        sentence_in = sentence_in.permute([1, 0]).reshape(-1).contiguous()
        targets = targets.permute([1, 0]).reshape(-1).contiguous()

        model.zero_grad()
        loss = model.neg_log_likelihood(sentence_in.squeeze(-1),
                                        targets.squeeze(-1)) / len(sentence_in)

        loss.backward()
        optimizer.step()

        print("{}-{}: {:.5f}".format(epoch, iter, loss.item()))
Example #4
def train(conf):
    train_sentences = load_sentences(conf.train_file, conf.zeros)
    dev_sentences = load_sentences(conf.dev_file, conf.zeros)
    test_sentences = load_sentences(conf.test_file, conf.zeros)

    dico_chars_train = char_mapping(train_sentences, conf.lower)[0]
    dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        dico_chars_train.copy(), conf.emb_file,
        list(
            itertools.chain.from_iterable([[w[0] for w in s]
                                           for s in test_sentences])))
    _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 conf.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               conf.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                conf.lower)

    # load pretrained word embeddings
    all_word_embeds = {}
    for line in codecs.open(conf.emb_file, 'r', 'utf-8'):
        s = line.strip().split()
        if len(s) == conf.embedding_dim + 1:
            all_word_embeds[s[0]] = np.array([float(x) for x in s[1:]])
    word_embeds_dict = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06),
                                         (len(char_to_id), conf.embedding_dim))
    for w in char_to_id:
        if w in all_word_embeds:
            word_embeds_dict[char_to_id[w]] = all_word_embeds[w]
        elif w.lower() in all_word_embeds:
            word_embeds_dict[char_to_id[w]] = all_word_embeds[w.lower()]
    print('Loaded %i pretrained embeddings.' % len(all_word_embeds))

    train_manager = BatchManager(train_data, conf.batch_size)

    model = BiLSTM_CRF(conf, tag_to_id, char_to_id, word_embeds_dict)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=conf.learning_rate,
                                weight_decay=1e-4)
    dev_f1_ = 0
    for epoch in range(1, conf.epochs + 1):
        print(f'train on epoch {epoch}')
        j = 1
        for batch in train_manager.iter_batch(shuffle=True):
            batch_loss = 0.0
            sentences = batch[1]
            tags = batch[-1]
            for i, index in enumerate(np.random.permutation(len(sentences))):
                model.zero_grad()
                sentence_in = sentences[index]
                tags_in = tags[index]
                loss = model.neg_log_likelihood(sentence_in, tags_in)
                loss.backward()
                optimizer.step()
                batch_loss += loss.item()
            print(f'[batch {j}, batch size: {conf.batch_size}] batch loss: {batch_loss}')
            j += 1
        print(f'Begin validating on the [epoch {epoch}] dev dataset ...')
        dev_results = get_predictions(model, dev_data, id_to_tag)
        dev_f1 = evaluate_ner(dev_results, conf)
        if dev_f1 > dev_f1_:
            dev_f1_ = dev_f1  # track the best dev F1 so only improvements are saved
            torch.save(model, conf.model_file)
            print('saved model successfully.')
        test_results = get_predictions(model, test_data, id_to_tag)
        test_f1 = evaluate_ner(test_results, conf)
        print(f'[epoch {epoch}] f1 on test dataset: {test_f1:.3f}')
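Example #4 serializes the whole module with torch.save(model, conf.model_file) rather than a state_dict, so restoring it is a single call; a minimal sketch, assuming the BiLSTM_CRF class is importable when the file is unpickled:

# reload the pickled module for inference
model = torch.load(conf.model_file)
model.eval()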