Example #1
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, mode, prefix=global_step)
            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

if __name__ == '__main__':

    mode = 'lilbert'
    args = Arguments()
    numclasses = 3

    if mode != 'lilbert':
        teacher, tok = lilbert.get_bert()
        # Init the full-size custom BERT classifier model (teacher).
        dim = 768
        model = lilbert.BertClassifier(teacher, dim, numclasses)
        main(model, tok, mode)
    else:
        # lilbert mode: distill a smaller student from a saved teacher
        # (attention distillation enabled via BertDistillWithAttentionModel below).
        _, tok = lilbert.get_bert()
        teacher = torch.load(args.output_dir + args.model_name)
        student, _ = lilbert.get_bert(teacher.bert)
        student = lilbert.make_lil_bert(student, dim=420, vanilla=True)
        # teacher = lilbert.BertClassifier(teacher, 768, numclasses)
        student = lilbert.BertClassifier(student, 420, numclasses)
        # m = lilbert.BertDistillModel(teacher, student, alpha=2)
        m = lilbert.BertDistillWithAttentionModel(teacher, student, alpha=0.5)
        main(m, tok, mode)
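The internals of BertDistillWithAttentionModel are not shown in this example. As a rough sketch of the pattern such a wrapper usually implements (an assumption, not lilbert's actual code), the student's hard-label cross-entropy is blended with a temperature-softened KL term against the teacher's logits, weighted by alpha, and attention distillation adds an MSE term over attention maps:

import torch.nn.functional as F

def distill_loss(student_logits, teacher_logits, labels,
                 student_attn=None, teacher_attn=None,
                 alpha=0.5, temperature=2.0):
    # Hard-label cross-entropy on the student's own predictions.
    ce = F.cross_entropy(student_logits, labels)
    # Soft-target KL divergence against the (detached) teacher distribution.
    kd = F.kl_div(
        F.log_softmax(student_logits / temperature, dim=-1),
        F.softmax(teacher_logits.detach() / temperature, dim=-1),
        reduction="batchmean",
    ) * (temperature ** 2)
    loss = alpha * ce + (1.0 - alpha) * kd
    # Optional attention distillation: match student and teacher attention maps.
    if student_attn is not None and teacher_attn is not None:
        loss = loss + F.mse_loss(student_attn, teacher_attn.detach())
    return loss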
Example #2
    args.logging_loss_steps = dataset_logging[args.task_name]
    args.logging_steps = dataset_logging[args.task_name]

    if args.save:
        assert args.only_teacher is True and args.mode == 'loss_in_train_loop', \
            "The codebase only supports saving the teacher. To train the teacher, " \
            "set the only_teacher arg to True."
    # number of classes for a given dataset
    args.numclasses = GLUE_TASKS_NUM_LABELS[args.task_name.lower()]

    if args.mode == 'loss_in_train_loop':

        if args.only_teacher:
            # Snippet to make a BERT classifier with the given (teacher) dim
            teacher, tok = lilbert.get_bert()
            model = lilbert.BertClassifier(teacher, args.teacher_dim,
                                           args.numclasses)

        else:
            # Snippet to make a BERT classifier with a smaller (student) dim
            teacher, tok = lilbert.get_bert()
            student = lilbert.make_lil_bert(
                teacher.bert,
                dim=args.student_dim,
                method="cut",
                vanilla=args.from_scratch
            )  # make_lil_bert accepts either the full teacher or teacher.bert
            model = lilbert.BertClassifier(student, args.student_dim,
                                           args.numclasses)

    elif args.mode == 'loss_in_model':
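How make_lil_bert(..., method="cut") shrinks the 768-dim encoder down to student_dim is not visible in this snippet. A minimal sketch of the general idea, assuming "cut" keeps the leading slice of each pretrained weight matrix (an illustration only, not lilbert's actual implementation):

import torch
import torch.nn as nn

def cut_linear(layer: nn.Linear, in_dim: int, out_dim: int) -> nn.Linear:
    # Build a smaller linear layer and copy the leading slice of the
    # pretrained weight matrix (and bias) into it.
    new = nn.Linear(in_dim, out_dim, bias=layer.bias is not None)
    with torch.no_grad():
        new.weight.copy_(layer.weight[:out_dim, :in_dim])
        if layer.bias is not None:
            new.bias.copy_(layer.bias[:out_dim])
    return new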
Example #3
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Rul evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X update steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X update steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")

    # added here
    parser.add_argument('--neuron_prune', type=int, default=-1, help="Do neuron pruning after training (default -1 means no pruning)")
    parser.add_argument('--gpu', type=int, default=-1, help="Which GPU to run on")
    parser.add_argument('--finetune_last', type=int, default=-1, help="How many of the last encoder layers to finetune (if != -1, the embeddings are frozen and the given number must be < the number of encoder layers)")
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.gpu != -1:
        device = torch.device("cuda", args.gpu)
        args.n_gpu = 1
    elif args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

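    # Build the model: load full BERT-base (hidden size 768), attach a classification head,
    # cut it down to a 420-dim lil-BERT, and wrap it in the model class used by train/evaluate.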
    bert, tokenizer = lilbert.get_bert("bert-base-uncased")
    bertc = lilbert.BertClassifier(bert, 768, num_labels)
    bertc = lilbert.make_lil_bert_cut(bertc, 420, vanilla=False)
    bertm = lilbert.BertClassifierModel(bertc)

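    # Freeze the embeddings and all but the last finetune_last encoder layers by tagging
    # them with _no_grad; the train loop is assumed to skip these modules when building
    # the optimizer (see the sketch after main below).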
    if args.finetune_last > -1:
        bertc.bert.embeddings._no_grad = True
        num_bert_layers = len(bertc.bert.encoder.layer)
        for i in range(0, num_bert_layers - args.finetune_last):
            bertc.bert.encoder.layer[i]._no_grad = True

    if args.local_rank == 0:
        torch.distributed.barrier()  # End of barrier: the other processes can now load the model & vocab from cache

    bertm.to(args.device)

    logger.info("Training/evaluation parameters %s", args)


    # Training
    global_step = ""  # prefix for eval results; stays empty if --do_train is not set
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, bertm, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)


    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # TODO: saving model

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        result = evaluate(args, bertm, tokenizer, prefix=global_step)
        result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
        results.update(result)

    # Optional neuron pruning after training (reuses the training set)
    if args.neuron_prune > -1:
        assert args.do_train, "--neuron_prune requires --do_train"
        bertm_ = train_neuronprune(args, train_dataset, bertm, tokenizer)

        # Evaluation
        results = {}
        if args.do_eval and args.local_rank in [-1, 0]:
            result = evaluate(args, bertm_, tokenizer, prefix=global_step)
            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)


    return results
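The train loop that consumes the _no_grad flags set in the --finetune_last block above is not part of this example. A hypothetical helper (an assumption about the intended mechanism, not code from this repository) showing how the optimizer could be restricted to the untagged parameters:

import torch

def trainable_parameters(model: torch.nn.Module):
    # Gather the parameters of every module tagged with _no_grad = True,
    # then yield only the remaining parameters for the optimizer.
    frozen = set()
    for module in model.modules():
        if getattr(module, "_no_grad", False):
            frozen.update(id(p) for p in module.parameters())
    for p in model.parameters():
        if id(p) not in frozen:
            yield p

# e.g. optimizer = torch.optim.AdamW(trainable_parameters(bertm), lr=args.learning_rate)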