def main():
    """Encode every split of a task with a BERT encoder and save the results.

    All settings come from ``get_args()``. The train, dev and test examples of
    the selected task are passed through ``EmbeddingTaskRunner.run_encoding``
    and each result is written with ``torch.save`` into ``args.output_dir`` as
    ``train.dataset``, ``dev.dataset`` and ``test.dataset``. When
    ``args.task_name`` is ``'mnli'`` the mismatched dev/test examples are
    encoded too and stored as ``mm_dev.dataset`` / ``mm_test.dataset``.
    """
    logging.basicConfig(
        level=logging.INFO,
        datefmt='%m/%d/%Y %H:%M:%S',
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    )
    log = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    # Environment / bookkeeping set-up, driven by the parsed arguments.
    device, n_gpu = initialization.init_cuda_from_args(args, logger=log)
    initialization.init_seed(args, n_gpu=n_gpu, logger=log)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)

    # Tokenizer and (possibly unused) saved state.
    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        do_lower_case=args.do_lower_case,
        bert_vocab_path=args.bert_vocab_path,
    )
    all_state = shared_model_setup.load_overall_state(
        args.bert_load_path, relaxed=True)

    # Build the encoder either from a saved state dict or from the
    # pretrained weights; no other load mode is accepted here.
    load_mode = args.bert_load_mode
    if load_mode != "state_model_only":
        assert load_mode == "from_pretrained"
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(
            args.local_rank)
        bert_as_encoder = BertModel.from_pretrained(
            pretrained_model_name_or_path=args.bert_model,
            cache_dir=cache_dir,
        )
    else:
        bert_as_encoder = BertModel.from_state_dict(
            config_file=args.bert_config_json_path,
            state_dict=all_state['model'],
        )
    bert_as_encoder.to(device)

    # No optimizer and no total step count: the runner is only used to encode.
    runner_param = RunnerParameters(
        max_seq_length=args.max_seq_length,
        local_rank=args.local_rank,
        n_gpu=n_gpu,
        fp16=args.fp16,
        learning_rate=args.learning_rate,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        t_total=None,
        warmup_proportion=args.warmup_proportion,
        num_train_epochs=args.num_train_epochs,
        train_batch_size=args.train_batch_size,
        eval_batch_size=args.eval_batch_size,
    )
    runner = EmbeddingTaskRunner(
        bert_model=bert_as_encoder,
        optimizer=None,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=runner_param,
    )

    def encode_and_save(examples, mode, filename, saving_message):
        """Encode ``examples`` in ``mode`` and store them under ``filename``."""
        encoded = runner.run_encoding(examples, verbose=True, mode=mode)
        print(saving_message)
        torch.save(encoded, os.path.join(args.output_dir, filename))

    # Training split.
    print("Run training set encoding ... ")
    encode_and_save(task.get_train_examples(), 'train', "train.dataset",
                    "saving embeddings ... ")

    # Development split.
    encode_and_save(task.get_dev_examples(), 'eval', 'dev.dataset',
                    "saving embeddings ... ")

    # Test split.
    encode_and_save(task.get_test_examples(), 'test', "test.dataset",
                    "saving embeddings ... ")

    # HACK: MNLI additionally ships a mismatched dev/test pair.
    if args.task_name == 'mnli':
        print("=== Start embedding task for MNLI mis-matched ===")
        encode_and_save(
            MnliMismatchedProcessor().get_dev_examples(task.data_dir),
            'eval', "mm_dev.dataset", "=== Saving eval dataset ===")
        print("=== Saved ===")

        encode_and_save(
            MnliMismatchedProcessor().get_test_examples(task.data_dir),
            'test', "mm_test.dataset", "=== Saving tensor dataset ===")
        print("=== Saved ===")
예제 #2
0
def _featurize_examples(examples, tokenizer, max_seq_length, label_map):
    """Convert ``examples`` to four tensors: input ids, masks, segment ids, labels.

    Each example is passed to ``convert_example_to_feature``; the four values
    it returns are collected column-wise and every column is turned into a
    tensor with ``torch.tensor``.
    """
    input_ids, input_masks, segment_ids, label_ids = [], [], [], []
    for example in examples:
        input_id, input_mask, segment_id, label_id = \
            convert_example_to_feature(example, tokenizer, max_seq_length, label_map)
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        label_ids.append(label_id)
    return (torch.tensor(input_ids), torch.tensor(input_masks),
            torch.tensor(segment_ids), torch.tensor(label_ids))


def prepare_train_data(args):
    """Build the shuffled training ``DataLoader`` pairing main and auxiliary data.

    The main ("net") examples come from the task ``args.task_name`` under
    ``args.dataroot``; the auxiliary ("ssh") examples come from the task
    ``'aug-2'``, ``'aug-3'`` or ``'aug-4'`` under ``args.aug_dataroot``,
    chosen by ``args.auxiliary_labels`` (any value other than 2 or 3 selects
    ``'aug-4'``). Only as many auxiliary examples as there are main examples
    are used.

    Args:
        args: parsed options; reads ``model``, ``task_name``, ``dataroot``,
            ``aug_dataroot``, ``auxiliary_labels``, ``max_seq_length`` and
            ``batch_size``.

    Returns:
        A ``torch.utils.data.DataLoader`` over a ``TensorDataset`` whose items
        are 8-tuples: net input ids, masks, segment ids, labels, followed by
        the same four fields for the auxiliary data. Sampling is random.
    """
    from itertools import islice

    print('Preparing net training data...')

    tokenizer = AutoTokenizer.from_pretrained(args.model)

    net_task = get_task(args.task_name, args.dataroot)
    net_examples = net_task.get_train_examples()
    net_label_map = {label: i for i, label in enumerate(net_task.get_labels())}

    net_input_ids, net_input_masks, net_segment_ids, net_label_ids = \
        _featurize_examples(net_examples, tokenizer, args.max_seq_length, net_label_map)

    print('Preparing ssh training data...')

    if args.auxiliary_labels == 2:
        ssh_task = get_task('aug-2', args.aug_dataroot)
    elif args.auxiliary_labels == 3:
        ssh_task = get_task('aug-3', args.aug_dataroot)
    else:
        ssh_task = get_task('aug-4', args.aug_dataroot)

    ssh_examples = ssh_task.get_train_examples()
    ssh_label_map = {label: i for i, label in enumerate(ssh_task.get_labels())}

    # Only the first len(net) auxiliary examples end up in the dataset, so
    # truncate before featurizing instead of converting everything and then
    # discarding the surplus.
    # NOTE: if there are fewer auxiliary than main examples, TensorDataset
    # below rejects the mismatched sizes (same as before this change).
    ssh_input_ids, ssh_input_masks, ssh_segment_ids, ssh_label_ids = \
        _featurize_examples(islice(ssh_examples, len(net_input_ids)),
                            tokenizer, args.max_seq_length, ssh_label_map)

    trset = torch.utils.data.TensorDataset(net_input_ids, net_input_masks,
                                           net_segment_ids, net_label_ids,
                                           ssh_input_ids, ssh_input_masks,
                                           ssh_segment_ids, ssh_label_ids)

    trset_sampler = torch.utils.data.RandomSampler(trset)
    trloader = torch.utils.data.DataLoader(trset,
                                           batch_size=args.batch_size,
                                           sampler=trset_sampler,
                                           num_workers=0)
    return trloader
예제 #3
0
def main():
    """Train, save, validate and/or test an XLNet model on one task.

    Everything is controlled by the arguments from ``get_args()``:

    * ``do_train``: train with ``GlueTaskRunner``; with ``do_val_history`` the
      validation history is written to ``val_metrics_history.json``; with
      ``train_save_every`` a checkpoint is saved every that many steps and at
      the last step of each epoch.
    * ``do_save``: write the model state to ``all_state.p``.
    * ``do_val``: write ``val_preds.csv`` and ``val_metrics.json`` and print
      the metrics.
    * ``do_test``: write ``test_preds.csv``.

    All output files are placed in ``args.output_dir``.
    """
    logging.basicConfig(
        level=logging.INFO,
        datefmt='%m/%d/%Y %H:%M:%S',
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    )
    log = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    device, n_gpu = initialization.init_cuda_from_args(args, logger=log)
    initialization.init_seed(args, n_gpu=n_gpu, logger=log)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    # NOTE (original author): this may fail for "imdb", which is not among
    # the defined tasks.
    task = get_task(args.task_name, args.data_dir)

    # Tokenizer built from the given model settings. The original author
    # believed XLNet can share this tokenizer and marked several of these
    # arguments as still needing changes.
    tokenizer = shared_model_setup.create_tokenizer(
        xlnet_model_name=args.xlnet_model,
        xlnet_load_mode=args.xlnet_load_mode,
        do_lower_case=args.do_lower_case,
        xlnet_vocab_path=args.xlnet_vocab_path,
    )
    # Probably the pre-trained state (original author's note).
    all_state = shared_model_setup.load_overall_state(
        args.xlnet_load_path, relaxed=True)

    model = glue_model_setup.create_model(
        task_type=task.processor.TASK_TYPE,
        xlnet_model_name=args.xlnet_model,
        xlnet_load_mode=args.xlnet_load_mode,
        xlnet_load_args=args.xlnet_load_args,
        all_state=all_state,
        num_labels=len(task.processor.get_labels()),
        device=device,
        n_gpu=n_gpu,
        fp16=args.fp16,
        local_rank=args.local_rank,
        xlnet_config_json_path=args.xlnet_config_json_path,
    )

    # Training inputs and optimizer exist only when training was requested.
    if not args.do_train:
        train_examples, t_total, optimizer = None, 0, None
    else:
        if args.print_trainable_params:
            log_info.print_trainable_params(model)
        train_examples = task.get_train_examples()
        if args.train_examples_number is not None:
            train_examples = random_sample(train_examples,
                                           args.train_examples_number)
        t_total = shared_model_setup.get_opt_train_steps(
            num_train_examples=len(train_examples),
            args=args,
        )

        # NOTE(review): this reads ``bert_load_mode`` although the rest of
        # the script uses ``xlnet_*`` options -- confirm this is intended.
        if args.bert_load_mode == "state_all":
            optimizer_state = all_state["optimizer"]
        else:
            optimizer_state = None
        optimizer = shared_model_setup.create_optimizer(
            model=model,
            learning_rate=args.learning_rate,
            t_total=t_total,
            loss_scale=args.loss_scale,
            fp16=args.fp16,
            warmup_proportion=args.warmup_proportion,
            state_dict=optimizer_state,
        )

    # TODO (original author): clarify what the runner does for XLNet.
    label_list = task.get_labels()
    run_params = RunnerParameters(
        max_seq_length=args.max_seq_length,
        local_rank=args.local_rank,
        n_gpu=n_gpu,
        fp16=args.fp16,
        learning_rate=args.learning_rate,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        t_total=t_total,
        warmup_proportion=args.warmup_proportion,
        num_train_epochs=args.num_train_epochs,
        train_batch_size=args.train_batch_size,
        eval_batch_size=args.eval_batch_size,
    )
    runner = GlueTaskRunner(
        model=model,
        optimizer=optimizer,
        tokenizer=tokenizer,
        label_list=label_list,
        device=device,
        rparams=run_params,
    )

    if args.do_train:
        # Validation history and periodic checkpoints are mutually exclusive.
        assert at_most_one_of([args.do_val_history, args.train_save_every])
        if args.do_val_history:
            history = runner.run_train_val(
                train_examples=train_examples,
                val_examples=task.get_dev_examples(),
                task_name=task.name,
            )
            history_json = json.dumps(history, indent=2)
            history_path = os.path.join(args.output_dir,
                                        "val_metrics_history.json")
            with open(history_path, "w") as out:
                out.write(history_json)
        elif args.train_save_every:
            train_dataloader = runner.get_train_dataloader(
                train_examples, verbose=not args.not_verbose)
            for epoch in range(int(args.num_train_epochs)):
                for step, _, _ in runner.run_train_epoch_context(
                        train_dataloader):
                    # Checkpoint at every ``train_save_every``-th step and at
                    # the final step of the epoch; skip all other steps.
                    at_interval = (step % args.train_save_every
                                   == args.train_save_every - 1)
                    if not (at_interval
                            or step == len(train_dataloader) - 1):
                        continue
                    checkpoint_name = \
                        f"all_state___epoch{epoch:04d}___batch{step:06d}.p"
                    glue_model_setup.save_xlnet(
                        model=model,
                        optimizer=optimizer,
                        args=args,
                        save_path=os.path.join(args.output_dir,
                                               checkpoint_name),
                        save_mode=args.bert_save_mode,
                        verbose=not args.not_verbose,
                    )
        else:
            runner.run_train(train_examples)

    if args.do_save:
        # Persist the (trained) model.
        final_path = os.path.join(args.output_dir, "all_state.p")
        glue_model_setup.save_xlnet(
            model=model,
            optimizer=optimizer,
            args=args,
            save_path=final_path,
            save_mode=args.bert_save_mode,
        )

    # The MultiNLI-mismatched special case was removed from this script.
    if args.do_val:
        results = runner.run_val(task.get_dev_examples(),
                                 task_name=task.name,
                                 verbose=not args.not_verbose)
        val_preds_path = os.path.join(args.output_dir, "val_preds.csv")
        pd.DataFrame(results["logits"]).to_csv(val_preds_path,
                                               header=False,
                                               index=False)
        summary = {
            "loss": results["loss"],
            "metrics": results["metrics"],
        }
        metrics_str = json.dumps(summary, indent=2)
        print(metrics_str)
        val_metrics_path = os.path.join(args.output_dir, "val_metrics.json")
        with open(val_metrics_path, "w") as out:
            out.write(metrics_str)

    if args.do_test:
        logits = runner.run_test(task.get_test_examples(),
                                 verbose=not args.not_verbose)
        test_preds_path = os.path.join(args.output_dir, "test_preds.csv")
        pd.DataFrame(logits).to_csv(test_preds_path,
                                    header=False,
                                    index=False)
예제 #4
0
def main():
    """Train, save, validate and/or test a BERT model on one GLUE task.

    Everything is controlled by the arguments from ``get_args()``:

    * ``do_train``: train with ``GlueTaskRunner``; with ``do_val_history`` the
      validation history is written to ``val_metrics_history.json``; with
      ``train_save_every`` a checkpoint is saved every that many steps and at
      the last step of each epoch.
    * ``do_save``: write the model state to ``all_state.p``.
    * ``do_val``: write ``val_preds.csv`` and ``val_metrics.json`` and print
      the metrics. For the ``mnli`` task the mismatched dev set is evaluated
      too: its logits go to ``mm_val_preds.csv`` and ``val_metrics.json`` is
      rewritten with the matched metrics plus the mismatched ones under an
      ``mm-`` prefix.
    * ``do_test``: write ``test_preds.csv`` (and ``mm_test_preds.csv`` for
      ``mnli``).

    All output files are placed in ``args.output_dir``.
    """
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)

    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        do_lower_case=args.do_lower_case,
        bert_vocab_path=args.bert_vocab_path,
    )
    all_state = shared_model_setup.load_overall_state(args.bert_load_path, relaxed=True)
    model = glue_model_setup.create_model(
        task_type=task.processor.TASK_TYPE,
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        bert_load_args=args.bert_load_args,
        all_state=all_state,
        num_labels=len(task.processor.get_labels()),
        device=device,
        n_gpu=n_gpu,
        fp16=args.fp16,
        local_rank=args.local_rank,
        bert_config_json_path=args.bert_config_json_path,
    )
    if args.do_train:
        if args.print_trainable_params:
            log_info.print_trainable_params(model)
        train_examples = task.get_train_examples()
        t_total = shared_model_setup.get_opt_train_steps(
            num_train_examples=len(train_examples),
            args=args,
        )
        optimizer = shared_model_setup.create_optimizer(
            model=model,
            learning_rate=args.learning_rate,
            t_total=t_total,
            loss_scale=args.loss_scale,
            fp16=args.fp16,
            warmup_proportion=args.warmup_proportion,
            # Restore the optimizer state only when the full state was loaded.
            state_dict=all_state["optimizer"] if args.bert_load_mode == "state_all" else None,
        )
    else:
        train_examples = None
        t_total = 0
        optimizer = None

    runner = GlueTaskRunner(
        model=model,
        optimizer=optimizer,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=RunnerParameters(
            max_seq_length=args.max_seq_length,
            local_rank=args.local_rank, n_gpu=n_gpu, fp16=args.fp16,
            learning_rate=args.learning_rate, gradient_accumulation_steps=args.gradient_accumulation_steps,
            t_total=t_total, warmup_proportion=args.warmup_proportion,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size, eval_batch_size=args.eval_batch_size,
        )
    )

    if args.do_train:
        # Validation history and periodic checkpoints are mutually exclusive.
        assert at_most_one_of([args.do_val_history, args.train_save_every])
        if args.do_val_history:
            val_examples = task.get_dev_examples()
            results = runner.run_train_val(
                train_examples=train_examples,
                val_examples=val_examples,
                task_name=task.name,
            )
            metrics_str = json.dumps(results, indent=2)
            with open(os.path.join(args.output_dir, "val_metrics_history.json"), "w") as f:
                f.write(metrics_str)
        elif args.train_save_every:
            train_dataloader = runner.get_train_dataloader(train_examples, verbose=not args.not_verbose)
            for epoch in range(int(args.num_train_epochs)):
                for step, _, _ in runner.run_train_epoch_context(train_dataloader):
                    # Checkpoint every ``train_save_every`` steps and at the
                    # last step of the epoch.
                    if step % args.train_save_every == args.train_save_every - 1 \
                            or step == len(train_dataloader) - 1:
                        glue_model_setup.save_bert(
                            model=model, optimizer=optimizer, args=args,
                            save_path=os.path.join(
                                args.output_dir, f"all_state___epoch{epoch:04d}___batch{step:06d}.p"
                            ),
                            save_mode=args.bert_save_mode,
                            verbose=not args.not_verbose,
                        )
        else:
            runner.run_train(train_examples)

    if args.do_save:
        # Save a trained model
        glue_model_setup.save_bert(
            model=model, optimizer=optimizer, args=args,
            save_path=os.path.join(args.output_dir, "all_state.p"),
            save_mode=args.bert_save_mode,
        )

    if args.do_val:
        val_examples = task.get_dev_examples()
        results = runner.run_val(val_examples, task_name=task.name, verbose=not args.not_verbose)
        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"), header=False, index=False)
        metrics_str = json.dumps({"loss": results["loss"], "metrics": results["metrics"]}, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

        # HACK for MNLI-mismatched
        if task.name == "mnli":
            mm_val_examples = MnliMismatchedProcessor().get_dev_examples(task.data_dir)
            mm_results = runner.run_val(mm_val_examples, task_name=task.name, verbose=not args.not_verbose)
            # Write the *mismatched* logits; previously the matched logits
            # were written to this file by mistake.
            mm_df = pd.DataFrame(mm_results["logits"])
            mm_df.to_csv(os.path.join(args.output_dir, "mm_val_preds.csv"), header=False, index=False)
            # ``dict(...)`` accepts a mapping or an iterable of pairs, so the
            # merge works on key/value pairs rather than on bare dict keys.
            combined_metrics = dict(results["metrics"])
            for k, v in dict(mm_results["metrics"]).items():
                combined_metrics["mm-" + k] = v
            combined_metrics_str = json.dumps({
                "loss": results["loss"],
                "metrics": combined_metrics,
            }, indent=2)
            # Overwrites the matched-only file written above.
            with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
                f.write(combined_metrics_str)

    if args.do_test:
        test_examples = task.get_test_examples()
        logits = runner.run_test(test_examples, verbose=not args.not_verbose)
        df = pd.DataFrame(logits)
        df.to_csv(os.path.join(args.output_dir, "test_preds.csv"), header=False, index=False)

        # HACK for MNLI-mismatched
        if task.name == "mnli":
            mm_test_examples = MnliMismatchedProcessor().get_test_examples(task.data_dir)
            # Honour the verbosity flag like every other run call does.
            mm_logits = runner.run_test(mm_test_examples, verbose=not args.not_verbose)
            mm_df = pd.DataFrame(mm_logits)
            mm_df.to_csv(os.path.join(args.output_dir, "mm_test_preds.csv"), header=False, index=False)
        eval_dataloader = DataLoader(
            eval_data,
            sampler=eval_sampler,
            batch_size=self.rparams.eval_batch_size,
        )
        return HybridLoaderSeparated(eval_dataloader, eval_tokens_a,
                                     eval_tokens_b)


if __name__ == "__main__":
    from glue.tasks import get_task
    from shared import model_setup as shared_model_setup
    from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
    from pytorch_pretrained_bert.modeling import BertModel

    task = get_task("wnli", "../../jiant_data/WNLI")
    train_examples = task.get_train_examples()
    label_map = {k: v for v, k in enumerate(task.get_labels())}
    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name="bert-base-uncased",
        bert_load_mode="from_pretrained",
        do_lower_case=True,
    )
    bert_vocab_path = "../cache/bert_metadata/uncased_L-12_H-768_A-12/vocab.txt"
    train_features = convert_examples_to_features_separated(
        train_examples,
        label_map=label_map,
        max_seq_length=100,
        tokenizer=tokenizer,
        verbose=True)
    train_data, train_tokens_a, train_tokens_b = convert_to_dataset_separated(
예제 #6
0
# Command-line options of the augmentation script (the parser itself is
# created earlier in the file).
parser.add_argument('--alpha', default=0.1, type=float)
parser.add_argument('--num_aug', default=1, type=int)
parser.add_argument('--num_type', default=4, type=int)
parser.add_argument('--task_name', default='CoLA')
parser.add_argument('--dataroot', default='./glue_data/')
parser.add_argument('--aug_dataroot', default='./aug_data/')

args = parser.parse_args()

alpha = args.alpha
num_aug = args.num_aug
num_type = args.num_type
task_name = args.task_name
# Input data is read from <dataroot>/<task_name>; the task itself is looked
# up by its lower-cased name.
task_dir = os.path.join(args.dataroot, task_name)
task = get_task(task_name.lower(), task_dir)
output_dir = os.path.join(args.aug_dataroot, task_name)

# An already existing output directory is fine; any other failure (e.g. a
# permission error) is raised here instead of being silently ignored.
os.makedirs(output_dir, exist_ok=True)

ori_train_df = task.get_train_df()
ori_dev_df = task.get_dev_df()

# Empty frame with the two columns "sentence" and "label".
aug_train_df = pd.DataFrame(columns=["sentence", "label"])

print("Trainning dataset preview:")
print("train sentences num:", len(ori_train_df))
print("Original:", ori_train_df.head())