import torch
from torch.utils.data import Dataset

from vocabulary import Vocabulary

# Target sample layout: (word1, word2, ...), (tag1, tag2, ...)
# NOTE: this runs when the module is imported, not only when it is executed.
voc = Vocabulary.build_corpus('train')


class ResumeData(Dataset):
    """Dataset yielding one sentence as a pair of id tensors.

    Each item is ``(word_ids, tag_ids)``: the ids of the tokens of sentence
    ``item`` and the ids of that sentence's tags, both as ``torch.long``
    tensors.

    Args:
        voc: Vocabulary object exposing ``word_lists``, ``tag_lists``,
            ``word2id`` and ``tag2id``.
        unk_token: Optional token whose id replaces words that are missing
            from ``voc.word2id``. With the default ``None`` the lookup is
            strict and an unknown word raises ``KeyError``, exactly as before
            this parameter existed.

    Raises:
        KeyError: At construction time if ``unk_token`` is given but is not
            itself present in ``voc.word2id``.
    """

    def __init__(self, voc, unk_token=None):
        self.voc = voc
        self.word_lists = voc.word_lists
        self.tag_lists = voc.tag_lists
        # Resolve the fallback id once so a misspelled unk_token fails here
        # rather than on the first unknown word during training.
        self.unk_id = None if unk_token is None else voc.word2id[unk_token]

    def __getitem__(self, item):
        """Return ``(word_ids, tag_ids)`` tensors for sentence ``item``."""
        # Map every token of the sentence to its id.
        word2id = self.voc.word2id
        tokens = self.word_lists[item]
        if self.unk_id is None:
            word_ids = [word2id[token] for token in tokens]
        else:
            # Assumes word2id offers a dict-style .get(); only reached when
            # the caller opted in to unknown-token handling.
            word_ids = [word2id.get(token, self.unk_id) for token in tokens]

        # Map every tag of the sentence to its id (always a strict lookup).
        tag2id = self.voc.tag2id
        tag_ids = [tag2id[tag] for tag in self.tag_lists[item]]

        return (
            torch.tensor(word_ids, dtype=torch.long),
            torch.tensor(tag_ids, dtype=torch.long),
        )

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.word_lists)
def build_arg_parser():
    """Build the command-line parser for the training script.

    Returns:
        An ``ArgumentParser`` with the options ``--data``, ``--epoch``,
        ``--load_model_name``, ``--save_model_name`` and ``--save_every``.
    """
    parser = ArgumentParser()
    parser.add_argument(
        '--data',
        default='train',
        help='Choose data type to train your model',
        choices=['train', 'test', 'dev'],
    )
    # type=int is required: without it a value given on the command line is
    # kept as a str, while the default (10) is an int.
    parser.add_argument(
        '--epoch',
        type=int,
        default=10,
        help='Epochs to train your model',
    )
    parser.add_argument(
        '--load_model_name',
        type=str,
        help='If wanna load model stats before trainning',
    )
    parser.add_argument(
        '--save_model_name',
        type=str,
        help='Directory to save your model',
    )
    # Adjacent literals instead of a backslash continuation inside the string,
    # so source indentation does not leak into the help message.
    parser.add_argument(
        '--save_every',
        type=int,
        default=1,
        help='After n epoch to save you model, '
             'make sure you had type in save_model_name param first',
    )
    return parser


if __name__ == '__main__':
    # NOTE(review): ArgumentParser, optim, BiLSTM_CRF and train are not
    # imported in the visible part of this file - confirm they are in scope.
    parser = build_arg_parser()
    args = parser.parse_args()
    print(f'we are gonna use the following arguments:\n{args.__dict__}\n')

    # Build the corpus for the selected split and wrap it as a Dataset.
    voc = Vocabulary.build_corpus(args.data)
    data = ResumeData(voc)

    # The two 100s are presumably embedding and hidden sizes - TODO confirm
    # against the BiLSTM_CRF constructor.
    model = BiLSTM_CRF(len(voc.word2id), voc.tag2id, 100, 100)
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

    train(
        args.epoch,
        model,
        optimizer,
        data,
        args.load_model_name,
        args.save_model_name,
        args.save_every,
    )