Example #1
0
def main():
    """Train and/or evaluate a mixture-of-experts DistilBERT QA system.

    Controlled entirely by the parsed CLI args: --do_train runs training of
    the expert models plus gate network; --do_eval restores the experts/gate
    and writes a submission CSV of predictions.
    """
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # Backbone instance used here only to supply a config to GateNetwork.
    DistilBert = DistilBertModel.from_pretrained('distilbert-base-uncased')
    # One independent QA expert per --num_experts, each with its own backbone.
    Experts = [DistilBertQA(DistilBertModel.from_pretrained('distilbert-base-uncased')).to(device) for _ in range(args.num_experts)]
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    # Gate that routes inputs to experts. Presumably 384 is the max sequence
    # length and 3 the number of routed experts -- TODO confirm against the
    # GateNetwork definition (note it is hard-coded, not args.num_experts).
    gate_model = GateNetwork(384, 3,3, DistilBert.config).to(device)
    print(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        # Re-point save_dir at a fresh run-specific subdirectory.
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = device
        trainer = train.Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                batch_size=args.batch_size,
                                sampler=RandomSampler(train_dataset))
        # Validation is consumed one example at a time, in dataset order.
        val_loader = DataLoader(val_dataset,
                                batch_size=1,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(Experts, gate_model, train_loader, val_loader, val_dict, args.num_experts)
    if args.do_eval:
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = train.Trainer(args, log)
        # load model
        # NOTE(review): an empty string is passed as the restore path --
        # verify restore_model treats "" as "use its default location".
        restore_model("",args.num_experts, Experts, gate_model)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=1,
                                 sampler=SequentialSampler(eval_dataset))
        # NOTE(review): args.device is assigned only here on the eval path,
        # after data prep; evaluate presumably reads it -- confirm.
        args.device = device
        eval_preds, eval_scores = trainer.evaluate(Experts, gate_model, eval_loader,
                                                   eval_dict, return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
Example #2
0
def main():
    """Prepare train / in-domain-val / out-of-domain-val loaders and run
    (optionally adversarial) training, as selected by the CLI args."""
    torch.autograd.set_detect_anomaly(True)
    # Parse command-line arguments and fix the RNG seed for reproducibility.
    args = get_train_test_args()
    util.set_seed(args.seed)
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    # 'save/xxx' is the sentinel meaning "no explicit model dir was given":
    # create a fresh run directory; otherwise reuse the given model dir.
    args.save_dir = (util.get_save_dir(args.save_dir, args.run_name)
                     if args.model_dir == 'save/xxx' else args.model_dir)

    trainer = AdvTrainer(args) if args.train_adv else Trainer(args)

    print("Preparing Training Data...")
    # One sub-dataset per comma-separated name; the index tags its domain.
    train_parts = [
        get_dataset(args, name, args.train_dir, tokenizer, 'train', idx)[0]
        for idx, name in enumerate(args.train_datasets.split(","))
    ]
    train_loader = DataLoader(ConcatDataset(train_parts),
                              batch_size=args.batch_size,
                              shuffle=True)

    print("Preparing ind Validation Data...")
    ind_val_dataset, ind_val_dict = get_dataset(
        args, args.train_datasets.split(","), args.ind_val_dir, tokenizer, 'ind_val')
    ind_val_loader = DataLoader(ind_val_dataset,
                                batch_size=args.batch_size,
                                shuffle=False)

    print("Preparing ood Validation Data...")
    # One loader/dict/name triple per out-of-domain validation dataset.
    ood_val_loaders, ood_val_dicts, ood_val_names = [], [], []
    for name in args.ood_val_datasets.split(","):
        dataset, gold_dict = get_dataset(args, name, args.ood_val_dir, tokenizer, 'ood_val')
        ood_val_loaders.append(DataLoader(dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False))
        ood_val_dicts.append(gold_dict)
        ood_val_names.append(name)

    trainer.train(train_loader, ind_val_loader, ind_val_dict,
                  ood_val_loaders, ood_val_dicts, ood_val_names,
                  args.resume_iters)
Example #3
0
def main():
    """Train and/or evaluate either a mixture-of-experts (MoE) model or a
    plain DistilBERT QA model, selected by the CLI flags."""

    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)

    # Select the model: MoE (gate loaded and frozen in eval mode when only
    # evaluating) or a vanilla DistilBERT QA head. `experts` records which.
    if args.mixture_of_experts and args.do_eval:
        model = MoE(load_gate=True)
        experts = True
        model.gate.eval()

    elif args.mixture_of_experts and args.do_train:
        model = MoE(load_gate=False)
        experts = True

    else:
        model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
        experts = False

    # Optionally re-initialise the last --reinit transformer layers
    # (discards top-layer pretrained weights before fine-tuning).
    # NOTE(review): assumes `model` has a .distilbert attribute -- confirm
    # MoE exposes one, since this also runs for the MoE branches above.
    if args.reinit > 0:
        transformer_temp = getattr(model, 'distilbert')
        for layer in transformer_temp.transformer.layer[-args.reinit:]:
            for module in layer.modules():
                print(type(module))
                if isinstance(module, (nn.Linear, nn.Embedding)):
                    module.weight.data.normal_(mean=0.0, std=transformer_temp.config.initializer_range)
                elif isinstance(module, nn.LayerNorm):
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)
                if isinstance(module, nn.Linear) and module.bias is not None:
                    module.bias.data.zero_()

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        # Re-point save_dir at a fresh run-specific subdirectory.
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                batch_size=args.batch_size,
                                sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict, experts)
    if args.do_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        # Non-MoE eval restores the fine-tuned checkpoint from save_dir;
        # the MoE path keeps the model constructed above (gate pre-loaded).
        if args.mixture_of_experts is False:
            checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
            model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
            model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
    
        # NOTE(review): MoE=True is passed unconditionally, even on the plain
        # DistilBERT branch -- confirm Trainer.evaluate handles that case.
        eval_preds, eval_scores = trainer.evaluate(model, eval_loader,
                                                   eval_dict, return_preds=True,
                                                   split=split_name, MoE = True)

        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
Example #4
0
def main():
    """Train and/or evaluate the adversarial QA model.

    Three independently-gated phases: --do_train trains from scratch;
    --continue_to_eval evaluates the in-memory model (e.g. straight after
    training); --do_eval evaluates a model restored from
    args.saved_model_filename. Both eval phases write a submission CSV.
    """
    # Get command-line args and set seed
    args = get_train_test_args()
    util_adversarial.set_seed(args.seed)

    # Load model
    model = AdversarialModel(args)
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')

    if args.do_train:
        # Make /save directory
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util_adversarial.get_save_dir(args.save_dir,
                                                      args.run_name)

        # Get logger
        log = util_adversarial.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')

        # Set the device to cuda if GPU available
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')

        # Load training data
        log.info("Preparing Training Data...")
        train_dataset, train_dict = data_utils.get_dataset(
            args, args.train_datasets, args.train_dir, tokenizer, 'train')
        train_loader = DataLoader(
            train_dataset,
            batch_size=args.
            batch_size,  # batches the examples into groups of 16
            sampler=RandomSampler(train_dataset))
        # Load validation data
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = data_utils.get_dataset(args,
                                                       args.train_datasets,
                                                       args.val_dir, tokenizer,
                                                       'val')
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))

        # Train!!!
        trainer = Trainer(args, log)
        trainer.train(model, train_loader, val_loader, val_dict)

    if args.continue_to_eval:
        # NOTE(review): this reuses the in-memory `model` and relies on
        # args.save_dir having been re-pointed by the do_train branch; with
        # --continue_to_eval alone it writes into the raw --save_dir and the
        # model is untrained -- confirm that is intended.
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util_adversarial.get_logger(args.save_dir, f'log_{split_name}')

        # Load model
        model.to(args.device)

        # Load eval data
        eval_dataset, eval_dict = data_utils.get_dataset(
            args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))

        # Evaluate!!!
        trainer = Trainer(args, log)
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   data_loader=eval_loader,
                                                   data_dict=eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval in continue_to_eval {results_str}')

        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])

    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util_adversarial.get_logger(args.save_dir, f'log_{split_name}')
        # NOTE(review): both checkpoint paths below are computed but unused --
        # the actual restore happens via load_path in the constructor.
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        checkpoint_path_qa_output = os.path.join(args.save_dir,
                                                 'qa_output_state')

        # Load model from the explicitly-named saved weights file.
        model = AdversarialModel(args, load_path=args.saved_model_filename)
        # model.load(checkpoint_path)
        # model.load_qa_output_model(checkpoint_path_qa_output)
        model.to(args.device)

        # Load eval data
        eval_dataset, eval_dict = data_utils.get_dataset(
            args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))

        # Evaluate!!!
        trainer = Trainer(args, log)
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   data_loader=eval_loader,
                                                   data_dict=eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')

        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
Example #5
0
def main():
    """Train and/or evaluate a DistilBERT QA model (with a domain
    discriminator at eval time), as selected by the CLI args."""
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)
    # Resolve the device once, up front. The --resume_training branch below
    # calls model.to(args.device); in the original code args.device was only
    # assigned *after* that branch, so resuming raised AttributeError.
    args.device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = DistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased")
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        if args.resume_training:
            # Continue from the previous run's checkpoint in save_dir.
            checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
            model = DistilBertForQuestionAnswering.from_pretrained(
                checkpoint_path)
            model.to(args.device)
        else:
            # Fresh run: re-point save_dir at a run-specific subdirectory.
            args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets,
                                       args.train_dir, tokenizer, 'train',
                                       args.outdomain_data_repeat)
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets,
                                            args.val_dir, tokenizer, 'val',
                                            args.outdomain_data_repeat)
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        # NOTE(review): validation uses RandomSampler here while evaluation
        # below uses SequentialSampler -- possibly intentional, but the
        # sibling scripts in this project validate sequentially. Confirm.
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=RandomSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        # Discriminator input: a single 768-dim hidden vector, or the full
        # flattened sequence (384 positions x 768 dims) under --full_adv.
        discriminator_input_size = 768
        if args.full_adv:
            discriminator_input_size = 384 * 768
        discriminator = DomainDiscriminator(
            input_size=discriminator_input_size)
        # discriminator.load_state_dict(torch.load(checkpoint_path + '/discriminator'))
        model.to(args.device)
        discriminator.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                              args.eval_dir, tokenizer,
                                              split_name,
                                              args.outdomain_data_repeat)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   discriminator,
                                                   eval_loader,
                                                   eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
Example #6
0
def main():
    """Train and/or evaluate a DomainQA model (QA head + domain-adversarial
    discriminator), with optional warm-starting from saved weights."""
    # define parser and arguments
    args = get_train_test_args()
    util.set_seed(args.seed)
    # model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    model = DomainQA(args.num_classes, args.hidden_size, args.num_layers,
                     args.dropout, args.dis_lambda, args.concat, args.anneal)
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        # Re-point save_dir at a fresh run-specific subdirectory.
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        # Optionally warm-start the full model from a previous checkpoint
        # (note: args.load_weights is overwritten with the resolved path).
        if args.load_weights != '':
            args.load_weights = os.path.join(args.load_weights, 'checkpoint',
                                             model.WEIGHTS_NAME)
            model.load_state_dict(torch.load(args.load_weights))
        # Or warm-start only the DistilBERT backbone from pretrained weights.
        if args.load_distilbert_weights != '':
            # args.load_distilbert_weights = os.path.join(args.load_distilbert_weights, 'checkpoint', model.WEIGHTS_NAME)
            args.load_distilbert_weights = os.path.join(
                args.load_distilbert_weights, 'checkpoint',
                'pytorch_model.bin')
            model.distilbert.load_state_dict(
                torch.load(args.load_distilbert_weights))
            print('loaded pretrained distilbert weights from',
                  args.load_distilbert_weights)

        trainer = Trainer(args, log, model)
        #target_data_dir, target_dataset, tokenizer, split_name, source_data_dir = None, source_dataset = None
        train_dataset, _ = get_train_dataset(args, \
                                       args.target_train_dir,\
                                       args.target_train_datasets,\
                                       tokenizer, 'train', \
                                       source_data_dir=args.source_train_dir, \
                                       source_dataset=args.source_train_datasets)
        log.info("Preparing Validation Data...")
        # Validation runs on the eval datasets, not the training ones.
        val_dataset, val_dict = get_dataset(args, \
                                       args.eval_datasets,\
                                       args.eval_dir,\
                                       tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log, model)
        # NOTE(review): config_path is computed but never used below.
        config_path = os.path.join(args.save_dir, 'checkpoint', 'config.json')
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint',
                                       model.WEIGHTS_NAME)
        model.load_state_dict(torch.load(checkpoint_path))
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                              args.eval_dir, tokenizer,
                                              split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(eval_loader,
                                                   eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
Example #7
0
def main():
    """Train and/or evaluate a DistilBERT QA model, with optional
    re-initialisation of the top --reinit_layers transformer layers."""
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased")
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')
    '''###'''
    # if args.reinit_pooler:
    #     encoder_temp = getattr(model, "distilbert")  # Equivalent to model.distilbert
    #     encoder_temp.pooler.dense.weight.data.normal_(mean=0.0, std=encoder_temp.config.initializer_range)
    #     encoder_temp.pooler.dense.bias.data.zero_()  # The change of encoder_temp would affect the model
    #     for p in encoder_temp.pooler.parameters():
    #         p.requires_grad = True

    # Re-initialise the weights of the last --reinit_layers transformer
    # layers via the model's own _init_weights, discarding their pretrained
    # values before fine-tuning.
    if args.reinit_layers > 0:
        import torch.nn as nn
        from transformers.models.distilbert.modeling_distilbert import MultiHeadSelfAttention, FFN
        # model_distilbert = getattr(model, "distilbert")  # model.distilbert; change of model_distilbert affects model!
        # Reinitialization for the last few layers
        for layer in model.distilbert.transformer.layer[-args.reinit_layers:]:
            for module in layer.modules():
                # print(module)
                model.distilbert._init_weights(
                    module)  # It's the line equivalent to below approach
                # if isinstance(module, nn.modules.linear.Linear):  # Original form for nn.Linear
                #     # model.config.initializer_range == model.distilbert.config.initializer_range => True
                #     module.weight.data.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                #     if module.bias is not None:
                #         module.bias.data.zero_()
                # elif isinstance(module, nn.modules.normalization.LayerNorm):
                #     module.weight.data.fill_(1.0)
                #     module.bias.data.zero_()
                # elif isinstance(module, FFN):
                #     for param in [module.lin1, module.lin2]:
                #         param.weight.data.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                #         if param.bias is not None:
                #             param.bias.data.zero_()
                # elif isinstance(module, MultiHeadSelfAttention):
                #     for param in [module.q_lin, module.k_lin, module.v_lin, module.out_lin]:
                #         param.data.weight.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                #         if param.bias is not None:
                #             param.bias.data.zero_()

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        # Re-point save_dir at a fresh run-specific subdirectory.
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets,
                                       args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets,
                                            args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            sampler=RandomSampler(
                train_dataset))  # For squad: 50537/16~3159 items/batches
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(
            checkpoint_path)  # Trained model
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                              args.eval_dir, tokenizer,
                                              split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   eval_loader,
                                                   eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
Example #8
0
def main():
    """Continue fine-tuning a previously-trained DistilBERT QA checkpoint on
    EDA-augmented training data, and/or evaluate a checkpoint."""
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)

    #### Change Made By Xuran Wang: Comment out original lines #######

    # model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    # tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    #### Change End #######



    #### Change Made By Xuran Wang: Add custom lines #######

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    # NOTE(review): hard-coded path to the baseline fine-tuned run used as
    # the starting checkpoint for further training.
    finetuned_model_path = 'save/baseline-01/'

    #### Change End #######


    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")


        #### Change Made By Xuran Wang: Add custom lines #######

        # Resume from the baseline checkpoint rather than the HF hub weights.
        checkpoint_path = os.path.join(finetuned_model_path, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)

        #### Change End #######

        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        model.to(args.device)

        trainer = Trainer(args, log)

        #### Change Made By Xuran Wang: Add custom lines, comment out original line #######

        # train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')

        # NOTE(review): `train_fraction` is not defined in this function --
        # presumably a module-level global elsewhere in the file; confirm,
        # otherwise this raises NameError at runtime.
        train_dataset, _ = get_dataset_eda_revised(args, args.train_datasets, args.train_dir, tokenizer, 'train', train_fraction)

         #### Change End #######

        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                batch_size=args.batch_size,
                                sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model, eval_loader,
                                                   eval_dict, return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
Example #9
0
def main():
    """Train and/or evaluate a DistilBERT question-answering model.

    The run is tracked with Weights & Biases; behavior is driven by the
    parsed CLI flags (``do_train`` / ``do_eval``).
    """
    args = get_train_test_args()

    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained(
        args.model_checkpoint)
    tokenizer = DistilBertTokenizerFast.from_pretrained(args.model_checkpoint)
    with wandb.init(project="qa-system", config=args) as run:
        run.name = args.run_name
        wandb.watch(model)
        if args.do_train:
            _train_phase(args, model, tokenizer, run)
        if args.do_eval:
            _eval_phase(args, tokenizer)


def _train_phase(args, model, tokenizer, run):
    # Fit the model on the training split, validate, and upload the saved
    # checkpoint directory as a wandb artifact.
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
    log = util.get_logger(args.save_dir, 'log_train')
    log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
    log.info("Preparing Training Data...")
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    trainer = Trainer(args, log)
    train_set, _ = get_dataset(args, args.train_datasets, args.train_dir,
                               tokenizer, 'train')
    log.info("Preparing Validation Data...")
    val_set, val_dict = get_dataset(args, args.val_datasets, args.val_dir,
                                    tokenizer, 'val')
    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              sampler=RandomSampler(train_set))
    val_loader = DataLoader(val_set,
                            batch_size=args.batch_size,
                            sampler=SequentialSampler(val_set))
    trainer.train(model, train_loader, val_loader, val_dict)
    artifact = wandb.Artifact(args.run_name, type="model")
    artifact.add_dir(os.path.join(args.save_dir, 'checkpoint'))
    run.log_artifact(artifact)


def _eval_phase(args, tokenizer):
    # Load a checkpoint (explicit --checkpoint_path wins over the one in
    # save_dir), score it on the eval split, and write a submission CSV.
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    split_name = 'test' if 'test' in args.eval_dir else 'validation'
    log = util.get_logger(args.save_dir, f'log_{split_name}')
    trainer = Trainer(args, log)
    if args.checkpoint_path != "":
        ckpt = args.checkpoint_path
    else:
        ckpt = os.path.join(args.save_dir, 'checkpoint')
    model = DistilBertForQuestionAnswering.from_pretrained(ckpt)
    model.to(args.device)
    eval_set, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir,
                                      tokenizer, split_name)
    eval_loader = DataLoader(eval_set,
                             batch_size=args.batch_size,
                             sampler=SequentialSampler(eval_set))
    preds, scores = trainer.evaluate(model, eval_loader, eval_dict,
                                     return_preds=True, split=split_name)
    results = ', '.join(f'{k}: {v:05.2f}' for k, v in scores.items())
    log.info(f'Eval {results}')
    # Write submission file
    sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as fh:
        writer = csv.writer(fh, delimiter=',')
        writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(preds):
            writer.writerow([uuid, preds[uuid]])
# Example #10 (score: 0)
def main():
    """Train and/or evaluate a DistilBERT question-answering model.

    Behavior is driven entirely by the parsed CLI flags:
      * ``do_finetune`` — reload the saved checkpoint and freeze the
        embedding layer plus the first ``freeze_layer`` transformer blocks,
        then continue into training/evaluation with that model.
      * ``do_train``   — fit the (possibly partially frozen) model.
      * ``do_eval``    — score the saved checkpoint and write a submission CSV.
    """
    args = get_train_test_args()

    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased")
    if args.do_finetune:
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        # Freeze the embeddings and the first `freeze_layer` transformer
        # blocks so only the remaining layers receive gradient updates.
        # str.startswith accepts a tuple of prefixes, so build them once
        # instead of re-scanning every prefix for every parameter.
        frozen_prefixes = ("distilbert.embeddings.",) + tuple(
            "distilbert.transformer.layer.%s." % i
            for i in range(args.freeze_layer))
        for name, param in model.named_parameters():
            if name.startswith(frozen_prefixes):
                param.requires_grad = False
        # BUG FIX: the original code `return`ed here, discarding the frozen
        # model without ever training or evaluating it — fall through so
        # do_train/do_eval operate on the partially frozen network.
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets,
                                       args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        # NOTE(review): the validation split is built from args.train_datasets
        # (not a separate val_datasets flag) — confirm this is intentional.
        val_dataset, val_dict = get_dataset(args, args.train_datasets,
                                            args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        # Evaluation always reloads the best checkpoint written by training.
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                              args.eval_dir, tokenizer,
                                              split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   eval_loader,
                                                   eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])