def do_inference(args):
    """Run GLUE inference with a saved (optionally quantized) Transformer
    sequence classifier.

    Scores the task's dev split when ``args.evaluate`` is set, otherwise the
    test split, and writes one prediction per line to
    ``<output_dir>/output.txt``.
    """
    prepare_output_path(args.output_dir, args.overwrite_output_dir)
    device, n_gpus = setup_backend(args.no_cuda)
    args.task_name = args.task_name.lower()
    task = get_glue_task(args.task_name, data_dir=args.data_dir)
    # Effective batch size scales with the number of GPUs in use.
    args.batch_size = args.per_gpu_eval_batch_size * max(1, n_gpus)
    model = TransformerSequenceClassifier.load_model(
        model_path=args.model_path,
        model_type=args.model_type,
        task_type=task.task_type,
        metric_fn=get_metric_fn(task.name),
        do_lower_case=args.do_lower_case,
        load_quantized=args.load_quantized_model,
    )
    model.to(device, n_gpus)
    if args.evaluate:
        examples = task.get_dev_examples()
    else:
        examples = task.get_test_examples()
    predictions = model.inference(
        examples, args.max_seq_length, args.batch_size, evaluate=args.evaluate
    )
    out_path = os.path.join(args.output_dir, "output.txt")
    with io.open(out_path, "w", encoding="utf-8") as out_file:
        for prediction in predictions:
            out_file.write("{}\n".format(prediction))
def do_inference(args):
    """Run token-tagging inference with a trained NeuralTagger.

    Reads raw input from ``args.data_file``, tags it with the model loaded
    from ``args.model_dir``, and writes the tagged output in column format to
    ``<output_dir>/output.txt``.
    """
    prepare_output_path(args.output_dir, args.overwrite_output_dir)
    device, n_gpus = setup_backend(args.no_cuda)
    # BUG FIX: the effective batch size was computed from
    # args.per_gpu_eval_batch_size (a Transformer-CLI argument; this
    # NeuralTagger CLI uses args.b elsewhere in the file) and was then never
    # used -- inference ran with the raw per-device args.b.  Compute the
    # GPU-scaled batch size from args.b and actually pass it to inference().
    args.batch_size = args.b * max(1, n_gpus)
    inference_examples = process_inference_input(args.data_file)
    classifier = NeuralTagger.load_model(model_path=args.model_dir)
    classifier.to(device, n_gpus)
    output = classifier.inference(inference_examples, args.batch_size)
    write_column_tagged_file(args.output_dir + os.sep + "output.txt", output)
def do_training(args):
    """Fine-tune a Transformer sequence classifier on a GLUE task.

    Builds the classifier, converts train/dev examples to tensors, trains
    with the classifier's default optimizer schedule, and saves the model to
    ``args.output_dir``.
    """
    prepare_output_path(args.output_dir, args.overwrite_output_dir)
    device, n_gpus = setup_backend(args.no_cuda)
    # Seed all RNGs for reproducibility.
    set_seed(args.seed, n_gpus)
    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    task = get_glue_task(args.task_name, data_dir=args.data_dir)
    classifier = TransformerSequenceClassifier(
        model_type=args.model_type,
        model_name_or_path=args.model_name_or_path,
        labels=task.get_labels(),
        task_type=task.task_type,
        metric_fn=get_metric_fn(task.name),
        config_name=args.config_name,
        tokenizer_name=args.tokenizer_name,
        do_lower_case=args.do_lower_case,
        output_path=args.output_dir,
        device=device,
        n_gpus=n_gpus,
    )
    effective_batch_size = args.per_gpu_train_batch_size * max(1, n_gpus)
    train_examples = task.get_train_examples()
    dev_examples = task.get_dev_examples()
    train_tensors = classifier.convert_to_tensors(train_examples, args.max_seq_length)
    dev_tensors = classifier.convert_to_tensors(dev_examples, args.max_seq_length)
    train_loader = DataLoader(
        train_tensors,
        sampler=RandomSampler(train_tensors),
        batch_size=effective_batch_size,
    )
    dev_loader = DataLoader(
        dev_tensors,
        sampler=SequentialSampler(dev_tensors),
        batch_size=args.per_gpu_eval_batch_size,
    )
    total_steps, _ = classifier.get_train_steps_epochs(
        args.max_steps,
        args.num_train_epochs,
        args.per_gpu_train_batch_size,
        len(train_tensors),
    )
    classifier.setup_default_optimizer(
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate,
        adam_epsilon=args.adam_epsilon,
        warmup_steps=args.warmup_steps,
        total_steps=total_steps,
    )
    classifier.train(
        train_loader,
        dev_loader,
        None,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        per_gpu_train_batch_size=args.per_gpu_train_batch_size,
        max_steps=args.max_steps,
        num_train_epochs=args.num_train_epochs,
        max_grad_norm=args.max_grad_norm,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
    )
    classifier.save_model(args.output_dir, args=args)
def do_inference(args):
    """Tag raw input with a trained Transformer token classifier.

    Reads input from ``args.data_file`` and writes the tagged result in
    column format to ``<output_dir>/output.txt``.
    """
    prepare_output_path(args.output_dir, args.overwrite_output_dir)
    device, n_gpus = setup_backend(args.no_cuda)
    # Effective batch size scales with the number of GPUs in use.
    args.batch_size = args.per_gpu_eval_batch_size * max(1, n_gpus)
    examples = process_inference_input(args.data_file)
    tagger = TransformerTokenClassifier.load_model(
        model_path=args.model_path,
        model_type=args.model_type,
        do_lower_case=args.do_lower_case,
        load_quantized=args.load_quantized_model,
    )
    tagger.to(device, n_gpus)
    predictions = tagger.inference(examples, args.max_seq_length, args.batch_size)
    write_column_tagged_file(args.output_dir + os.sep + "output.txt", predictions)
def do_inference(args):
    """Run GLUE test-set inference with a saved Transformer sequence
    classifier and write one prediction per line to
    ``<output_dir>/output.txt``.
    """
    prepare_output_path(args.output_dir, args.overwrite_output_dir)
    device, n_gpus = setup_backend(args.no_cuda)
    args.task_name = args.task_name.lower()
    task = get_glue_task(args.task_name, data_dir=args.data_dir)
    # Scale the per-GPU eval batch size by the number of GPUs in use.
    args.batch_size = args.per_gpu_eval_batch_size * max(1, n_gpus)
    classifier = TransformerSequenceClassifier.load_model(
        model_path=args.model_path, model_type=args.model_type)
    classifier.to(device, n_gpus)
    # NOTE(review): unlike the other GLUE do_inference in this file, only the
    # test split is scored and no max_seq_length is passed -- confirm that the
    # inference() signature expected by this script is (examples, batch_size).
    preds = classifier.inference(task.get_test_examples(), args.batch_size)
    with io.open(os.path.join(args.output_dir, "output.txt"), "w", encoding="utf-8") as fw:
        for p in preds:
            fw.write("{}\n".format(p))
def do_kd_training(args):
    """Train a NeuralTagger student with knowledge distillation from a
    Transformer token-classifier teacher.

    Builds the student (embedder, optional CRF), pairs its training tensors
    with the teacher's tensors for the same examples via ``ParallelDataset``,
    trains with a ``TeacherStudentDistill`` loss, and saves the student to
    ``args.output_dir``.
    """
    prepare_output_path(args.output_dir, args.overwrite_output_dir)
    device, n_gpus = setup_backend(args.no_cuda)
    # Set seed
    set_seed(args.seed, n_gpus)
    # prepare data
    processor = TokenClsProcessor(args.data_dir, tag_col=args.tag_col)
    train_ex = processor.get_train_examples()
    dev_ex = processor.get_dev_examples()
    test_ex = processor.get_test_examples()
    vocab = processor.get_vocabulary()
    # +1 leaves room for an extra index (presumably padding -- TODO confirm).
    vocab_size = len(vocab) + 1
    num_labels = len(processor.get_labels()) + 1
    # create an embedder
    embedder_cls = MODEL_TYPE[args.model_type]
    if args.config_file is not None:
        embedder_model = embedder_cls.from_config(vocab_size, num_labels, args.config_file)
    else:
        embedder_model = embedder_cls(vocab_size, num_labels)
    # load external word embeddings if present
    if args.embedding_file is not None:
        emb_dict = load_embedding_file(args.embedding_file)
        emb_mat = get_embedding_matrix(emb_dict, vocab)
        emb_mat = torch.tensor(emb_mat, dtype=torch.float)
        embedder_model.load_embeddings(emb_mat)
    classifier = NeuralTagger(
        embedder_model,
        word_vocab=vocab,
        labels=processor.get_labels(),
        use_crf=args.use_crf,
        device=device,
        n_gpus=n_gpus,
    )
    train_batch_size = args.b * max(1, n_gpus)
    train_dataset = classifier.convert_to_tensors(
        train_ex,
        max_seq_length=args.max_sentence_length,
        max_word_length=args.max_word_length,
    )
    teacher = TransformerTokenClassifier.load_model(
        model_path=args.teacher_model_path, model_type=args.teacher_model_type
    )
    teacher.to(device, n_gpus)
    # Teacher tensors for the same training examples (labels excluded).
    teacher_dataset = teacher.convert_to_tensors(train_ex, args.max_sentence_length, False)
    # Pair student and teacher tensors so each batch carries both views.
    train_dataset = ParallelDataset(train_dataset, teacher_dataset)
    train_sampler = RandomSampler(train_dataset)
    train_dl = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)
    # BUG FIX: dev_dl/test_dl/opt were read unconditionally in train() below
    # but assigned only inside the conditionals, raising NameError whenever
    # the dev/test split was missing or no learning rate was supplied.
    dev_dl = None
    test_dl = None
    opt = None
    if dev_ex is not None:
        dev_dataset = classifier.convert_to_tensors(
            dev_ex,
            max_seq_length=args.max_sentence_length,
            max_word_length=args.max_word_length,
        )
        dev_sampler = SequentialSampler(dev_dataset)
        dev_dl = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=args.b)
    if test_ex is not None:
        test_dataset = classifier.convert_to_tensors(
            test_ex,
            max_seq_length=args.max_sentence_length,
            max_word_length=args.max_word_length,
        )
        test_sampler = SequentialSampler(test_dataset)
        test_dl = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.b)
    if args.lr is not None:
        opt = classifier.get_optimizer(lr=args.lr)
    distiller = TeacherStudentDistill(
        teacher, args.kd_temp, args.kd_dist_w, args.kd_student_w, args.kd_loss_fn
    )
    classifier.train(
        train_dl,
        dev_dl,
        test_dl,
        epochs=args.e,
        batch_size=args.b,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
        save_path=args.output_dir,
        optimizer=opt,  # may be None; identical to the original conditional
        distiller=distiller,
    )
    classifier.save_model(args.output_dir)
def do_training(args):
    """Fine-tune a Transformer token classifier and save it to output_dir.

    Raises:
        Exception: if no training examples are found.
    """
    prepare_output_path(args.output_dir, args.overwrite_output_dir)
    device, n_gpus = setup_backend(args.no_cuda)
    # Set seed
    args.seed = set_seed(args.seed, n_gpus)
    # prepare data
    processor = TokenClsProcessor(args.data_dir)
    classifier = TransformerTokenClassifier(
        model_type=args.model_type,
        model_name_or_path=args.model_name_or_path,
        labels=processor.get_labels(),
        config_name=args.config_name,
        tokenizer_name=args.tokenizer_name,
        do_lower_case=args.do_lower_case,
        output_path=args.output_dir,
        device=device,
        n_gpus=n_gpus,
    )
    train_examples = processor.get_train_examples(filename=args.train_file_name)
    if train_examples is None:
        raise Exception("No train examples found, quitting.")
    dev_examples = processor.get_dev_examples()
    test_examples = processor.get_test_examples()
    effective_batch_size = args.per_gpu_train_batch_size * max(1, n_gpus)
    train_tensors = classifier.convert_to_tensors(
        train_examples, max_seq_length=args.max_seq_length
    )
    train_loader = DataLoader(
        train_tensors,
        sampler=RandomSampler(train_tensors),
        batch_size=effective_batch_size,
    )

    def _eval_loader(examples):
        # Sequential evaluation loader, or None when the split is absent.
        if examples is None:
            return None
        tensors = classifier.convert_to_tensors(examples, max_seq_length=args.max_seq_length)
        return DataLoader(
            tensors,
            sampler=SequentialSampler(tensors),
            batch_size=args.per_gpu_eval_batch_size,
        )

    dev_loader = _eval_loader(dev_examples)
    test_loader = _eval_loader(test_examples)
    total_steps, _ = classifier.get_train_steps_epochs(
        args.max_steps,
        args.num_train_epochs,
        args.per_gpu_train_batch_size,
        len(train_tensors),
    )
    classifier.setup_default_optimizer(
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate,
        adam_epsilon=args.adam_epsilon,
        warmup_steps=args.warmup_steps,
        total_steps=total_steps,
    )
    classifier.train(
        train_loader,
        dev_loader,
        test_loader,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        per_gpu_train_batch_size=args.per_gpu_train_batch_size,
        max_steps=args.max_steps,
        num_train_epochs=args.num_train_epochs,
        max_grad_norm=args.max_grad_norm,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
    )
    classifier.save_model(args.output_dir, args=args)
def _match_parallel_sizes(labeled_ds, unlabeled_ds):
    """Repeat the smaller of the two datasets until it is at least as long as
    the larger, so they can be batched in lockstep by a ``ParallelDataset``.

    The grown dataset is identified by its item arity: 4 tensors per item is
    treated as the unlabeled layout.  NOTE(review): confirm this arity
    invariant holds for both the student and teacher tensor layouts.
    Returns the (labeled, unlabeled) pair with the smaller one replaced by
    its repeated version.
    """
    larger_ds, smaller_ds = (
        (labeled_ds, unlabeled_ds)
        if len(labeled_ds) > len(unlabeled_ds)
        else (unlabeled_ds, labeled_ds)
    )
    concat_smaller_ds = smaller_ds
    while len(concat_smaller_ds) < len(larger_ds):
        concat_smaller_ds = ConcatTensorDataset(concat_smaller_ds, [smaller_ds])
    if len(concat_smaller_ds[0]) == 4:
        return labeled_ds, concat_smaller_ds
    return concat_smaller_ds, unlabeled_ds


def do_kd_pseudo_training(args):
    """Semi-supervised distillation: train a NeuralTagger student on labeled
    data plus a Transformer teacher's signal on unlabeled data.

    Labeled and unlabeled examples are either batched in parallel
    (``args.parallel_batching``) with sizes matched by repetition, or
    concatenated into one combined dataset.  The trained student is saved to
    ``args.output_dir``.
    """
    prepare_output_path(args.output_dir, args.overwrite_output_dir)
    device, n_gpus = setup_backend(args.no_cuda)
    # Set seed
    args.seed = set_seed(args.seed, n_gpus)
    # prepare data
    processor = TokenClsProcessor(
        args.data_dir, tag_col=args.tag_col, ignore_token=args.ignore_token
    )
    train_labeled_ex = processor.get_train_examples(filename=args.train_filename)
    train_unlabeled_ex = processor.get_train_examples(filename=args.unlabeled_filename)
    dev_ex = processor.get_dev_examples(filename=args.dev_filename)
    test_ex = processor.get_test_examples(filename=args.test_filename)
    # Build the vocabulary over every split so all tokens are in-vocab.
    vocab = processor.get_vocabulary(
        train_labeled_ex + train_unlabeled_ex + dev_ex + test_ex
    )
    # +1 leaves room for an extra index (presumably padding -- TODO confirm).
    vocab_size = len(vocab) + 1
    num_labels = len(processor.get_labels()) + 1
    # create an embedder
    embedder_cls = MODEL_TYPE[args.model_type]
    if args.config_file is not None:
        embedder_model = embedder_cls.from_config(vocab_size, num_labels, args.config_file)
    else:
        embedder_model = embedder_cls(vocab_size, num_labels)
    # load external word embeddings if present
    if args.embedding_file is not None:
        emb_dict = load_embedding_file(args.embedding_file, dim=embedder_model.word_embedding_dim)
        emb_mat = get_embedding_matrix(emb_dict, vocab)
        emb_mat = torch.tensor(emb_mat, dtype=torch.float)
        embedder_model.load_embeddings(emb_mat)
    classifier = NeuralTagger(
        embedder_model,
        word_vocab=vocab,
        labels=processor.get_labels(),
        use_crf=args.use_crf,
        device=device,
        n_gpus=n_gpus,
    )
    train_batch_size = args.b * max(1, n_gpus)
    train_labeled_dataset = classifier.convert_to_tensors(
        train_labeled_ex,
        max_seq_length=args.max_sentence_length,
        max_word_length=args.max_word_length,
    )
    train_unlabeled_dataset = classifier.convert_to_tensors(
        train_unlabeled_ex,
        max_seq_length=args.max_sentence_length,
        max_word_length=args.max_word_length,
        include_labels=False,
    )
    if args.parallel_batching:
        # match sizes of labeled/unlabeled train data for parallel batching
        train_labeled_dataset, train_unlabeled_dataset = _match_parallel_sizes(
            train_labeled_dataset, train_unlabeled_dataset
        )
    else:
        train_dataset = CombinedTensorDataset([train_labeled_dataset, train_unlabeled_dataset])
    # load saved teacher args if exist
    if os.path.exists(args.teacher_model_path + os.sep + "training_args.bin"):
        t_args = torch.load(args.teacher_model_path + os.sep + "training_args.bin")
        t_device, t_n_gpus = setup_backend(t_args.no_cuda)
        teacher = TransformerTokenClassifier.load_model(
            model_path=args.teacher_model_path,
            model_type=args.teacher_model_type,
            config_name=t_args.config_name,
            tokenizer_name=t_args.tokenizer_name,
            do_lower_case=t_args.do_lower_case,
            output_path=t_args.output_dir,
            device=t_device,
            n_gpus=t_n_gpus,
        )
    else:
        teacher = TransformerTokenClassifier.load_model(
            model_path=args.teacher_model_path, model_type=args.teacher_model_type
        )
    teacher.to(device, n_gpus)
    teacher_labeled_dataset = teacher.convert_to_tensors(train_labeled_ex, args.teacher_max_seq_len)
    teacher_unlabeled_dataset = teacher.convert_to_tensors(
        train_unlabeled_ex, args.teacher_max_seq_len, False
    )
    if args.parallel_batching:
        # match sizes of labeled/unlabeled teacher train data for parallel batching
        teacher_labeled_dataset, teacher_unlabeled_dataset = _match_parallel_sizes(
            teacher_labeled_dataset, teacher_unlabeled_dataset
        )
        train_all_dataset = ParallelDataset(
            train_labeled_dataset,
            teacher_labeled_dataset,
            train_unlabeled_dataset,
            teacher_unlabeled_dataset,
        )
        train_all_sampler = RandomSampler(train_all_dataset)
        # this way must use same batch size for both labeled/unlabeled sets
        train_dl = DataLoader(
            train_all_dataset, sampler=train_all_sampler, batch_size=train_batch_size
        )
    else:
        teacher_dataset = CombinedTensorDataset(
            [teacher_labeled_dataset, teacher_unlabeled_dataset]
        )
        train_dataset = ParallelDataset(train_dataset, teacher_dataset)
        train_sampler = RandomSampler(train_dataset)
        train_dl = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)
    # BUG FIX: dev_dl/test_dl/opt were read unconditionally in train() below
    # but assigned only inside the conditionals, raising NameError whenever
    # the dev/test split was missing or no learning rate was supplied.
    dev_dl = None
    test_dl = None
    opt = None
    if dev_ex is not None:
        dev_dataset = classifier.convert_to_tensors(
            dev_ex, max_seq_length=args.max_sentence_length, max_word_length=args.max_word_length
        )
        dev_sampler = SequentialSampler(dev_dataset)
        dev_dl = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=args.b)
    if test_ex is not None:
        test_dataset = classifier.convert_to_tensors(
            test_ex, max_seq_length=args.max_sentence_length, max_word_length=args.max_word_length
        )
        test_sampler = SequentialSampler(test_dataset)
        test_dl = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.b)
    if args.lr is not None:
        opt = classifier.get_optimizer(lr=args.lr)
    distiller = TeacherStudentDistill(
        teacher, args.kd_temp, args.kd_dist_w, args.kd_student_w, args.kd_loss_fn
    )
    classifier.train(
        train_dl,
        dev_dl,
        test_dl,
        epochs=args.e,
        batch_size=args.b,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
        save_path=args.output_dir,
        optimizer=opt,  # may be None; identical to the original conditional
        best_result_file=args.best_result_file,
        distiller=distiller,
        word_dropout=args.word_dropout,
    )
    classifier.save_model(args.output_dir)