def __init__(self, *paths):
    self.data = [list(U.FileReader(path).sents()) for path in paths]
    assert all(len(d) == len(self.data[0]) for d in self.data), \
        "Not all files have the same length."
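# Usage sketch (illustrative, not from the original file): assuming this
# __init__ belongs to the MultiSentWordDataset class that main() below
# constructs via D.MultiSentWordDataset, and that U.FileReader(path).sents()
# yields one tokenized sentence per file line. The file names are hypothetical.
#
#     dataset = D.MultiSentWordDataset("train.words", "train.pos", "train.labels")
#     # self.data then holds three parallel sentence lists: data[0][i],
#     # data[1][i] and data[2][i] all describe the i-th sentence, which is
#     # exactly what the assertion above guarantees.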
def main(args):
    logging.basicConfig(level=logging.INFO)
    check_arguments(args)

    logging.info("Creating vocabulary...")
    input_vocabs = []
    for path in args.input_path:
        vocab = utils.Vocabulary()
        words = utils.FileReader(path).words()
        vocab.add("<pad>")
        vocab.add("<unk>")
        utils.populate_vocab(words, vocab)
        input_vocabs.append(vocab)

    label_vocab = utils.Vocabulary()
    words = utils.FileReader(args.label_path).words()
    label_vocab.add("START")
    label_vocab.add("END")
    utils.populate_vocab(words, label_vocab)

    # Save the vocabularies so they can be reloaded at prediction time.
    for i, input_vocab in enumerate(input_vocabs):
        vocab_path = os.path.join(args.save_dir, "vocab-input{}.pkl".format(i + 1))
        with open(vocab_path, "wb") as f:
            pickle.dump(input_vocab, f)
    vocab_path = os.path.join(args.save_dir, "vocab-label.pkl")
    with open(vocab_path, "wb") as f:
        pickle.dump(label_vocab, f)

    logging.info("Initializing model...")
    crf = M.CRF(len(label_vocab))
    model = M.LSTMCRF(
        crf=crf,
        vocab_sizes=[len(v) for v in input_vocabs],
        word_dims=args.word_dim,
        hidden_dim=args.lstm_dim,
        layers=args.lstm_layers,
        dropout_prob=args.dropout_prob,
        bidirectional=args.bidirectional
    )
    model.reset_parameters()

    if args.gpu:
        gpu_main = args.gpu[0]
        model = model.cuda(gpu_main)

    params = sum(np.prod(p.size()) for p in model.parameters())
    logging.info("Number of parameters: {}".format(params))

    logging.info("Loading word embeddings...")
    # for vocab, we_type, we_path, we_freeze, emb in \
    #         zip(input_vocabs, args.wordembed_type, args.wordembed_path,
    #             args.wordembed_freeze, model.embeddings):
    #     if we_type == "glove":
    #         assert we_path is not None
    #         load_glove_embeddings(emb, vocab, we_path)
    #     elif we_type == "fasttext":
    #         assert we_path is not None
    #         assert args.fasttext_path is not None
    #         load_fasttext_embeddings(emb, vocab,
    #                                  fasttext_path=args.fasttext_path,
    #                                  embedding_path=we_path)
    #     elif we_type == "none":
    #         pass
    #     else:
    #         raise ValueError("Unrecognized word embedding "
    #                          "type: {}".format(we_type))
    #
    #     if we_freeze:
    #         emb.weight.requires_grad = False
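    # A loader in the spirit of the load_glove_embeddings call above might look
    # roughly like the sketch below (illustrative only; the real helper is
    # defined elsewhere in the project and its exact signature is not shown
    # here, and word2idx stands in for the Vocabulary's word-to-index mapping):
    #
    #     def load_glove_vectors(embedding, word2idx, glove_path):
    #         with open(glove_path, encoding="utf-8") as f:
    #             for line in f:
    #                 word, *vector = line.rstrip().split(" ")
    #                 if word in word2idx and len(vector) == embedding.embedding_dim:
    #                     with torch.no_grad():
    #                         embedding.weight[word2idx[word]] = torch.tensor(
    #                             [float(x) for x in vector])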
    # Copy the configuration file to the save directory if one was specified.
    if args.config:
        config_path = os.path.join(args.save_dir, os.path.basename(args.config))
        shutil.copy(args.config, config_path)

    def create_dataloader(dataset):
        return D.MultiSentWordDataLoader(
            dataset=dataset,
            input_vocabs=input_vocabs,
            label_vocabs=label_vocab,
            batch_size=args.batch_size,
            shuffle=args.shuffle,
            tensor_lens=True,
            num_workers=len(args.gpu) if args.gpu is not None else 1,
            pin_memory=True
        )

    dataset = D.MultiSentWordDataset(*args.input_path, args.label_path)
    test_dataset = D.MultiSentWordDataset(*args.test_input_path, args.test_label_path)

    if args.val:
        # Carve out a validation subset; the full dataset is still used for training.
        vr = args.val_ratio
        val_dataset, _ = dataset.split(vr, 1 - vr, shuffle=args.shuffle)
    else:
        val_dataset = None
    train_dataset = dataset

    train_dataloader = create_dataloader(train_dataset)
    test_dataloader = create_dataloader(test_dataset)
    val_dataloader = create_dataloader(val_dataset) if val_dataset is not None else None

    logging.info("Beginning training...")
    trainer = LSTMCRFTrainer(
        sargs=args,
        input_vocabs=input_vocabs,
        label_vocab=label_vocab,
        val_data=val_dataloader,
        model=model,
        epochs=args.epochs,
        gpus=args.gpu
    )
    trainer.train(train_dataloader, data_size=len(train_dataset))

    logging.info("Beginning testing...")
    trainer.test(test_dataloader, data_size=len(test_dataset))

    logging.info("Done!")
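
# Entry-point sketch (an assumption, not from the original script): the project
# presumably builds `args` from a config file and/or the command line, e.g. via
# the check_arguments helper called in main(); the flags and defaults below are
# illustrative stand-ins chosen to cover the attributes main() actually reads.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train an LSTM-CRF sequence tagger.")
    parser.add_argument("--config", default=None)
    parser.add_argument("--save-dir", dest="save_dir", required=True)
    parser.add_argument("--input-path", dest="input_path", nargs="+", required=True)
    parser.add_argument("--label-path", dest="label_path", required=True)
    parser.add_argument("--test-input-path", dest="test_input_path", nargs="+", required=True)
    parser.add_argument("--test-label-path", dest="test_label_path", required=True)
    parser.add_argument("--word-dim", dest="word_dim", type=int, nargs="+", default=[300])
    parser.add_argument("--lstm-dim", dest="lstm_dim", type=int, default=300)
    parser.add_argument("--lstm-layers", dest="lstm_layers", type=int, default=1)
    parser.add_argument("--dropout-prob", dest="dropout_prob", type=float, default=0.5)
    parser.add_argument("--bidirectional", action="store_true")
    parser.add_argument("--batch-size", dest="batch_size", type=int, default=32)
    parser.add_argument("--shuffle", action="store_true")
    parser.add_argument("--val", action="store_true")
    parser.add_argument("--val-ratio", dest="val_ratio", type=float, default=0.1)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--gpu", type=int, nargs="+", default=None)
    main(parser.parse_args())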