Example #1
def main():
    parser = argparse.ArgumentParser()
    model_config(parser)
    set_config(parser)
    train_config(parser)
    args = parser.parse_args()
    layer_indexes = [int(x) for x in args.layers.split(",")]
    set_environment(args.seed)
    # process data
    data, is_single_sentence = process_data(args)
    data_type = DataFormat.PremiseOnly if is_single_sentence else DataFormat.PremiseAndOneHypothesis
    collater = Collater(gpu=args.cuda, is_train=False, data_type=data_type)
    batcher = DataLoader(data, batch_size=args.batch_size, collate_fn=collater.collate_fn, pin_memory=args.cuda)
    opt = vars(args)
    # load model
    if os.path.exists(args.checkpoint):
        state_dict = torch.load(args.checkpoint)
        config = state_dict['config']
        config['dump_feature'] = True
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error(
            'Could not find the init model!\n The parameters will be initialized randomly!')
        logger.error('#' * 20)
        return
    num_all_batches = len(batcher)
    model = MTDNNModel(
        opt,
        state_dict=state_dict,
        num_train_step=num_all_batches)
    if args.cuda:
        model.cuda()

    features_dict = {}
    for batch_meta, batch_data in batcher:
        batch_meta, batch_data = Collater.patch_data(args.cuda, batch_meta, batch_data)
        all_encoder_layers, _ = model.extract(batch_meta, batch_data)
        embeddings = [all_encoder_layers[idx].detach().cpu().numpy()
                      for idx in layer_indexes]
        uids = batch_meta['uids']
        masks = batch_data[batch_meta['mask']].detach().cpu().numpy().tolist()
        for idx, uid in enumerate(uids):
            slen = sum(masks[idx])
            features = {}
            for yidx, layer in enumerate(layer_indexes):
                features[layer] = str(embeddings[yidx][idx][:slen].tolist())
            features_dict[uid] = features

    # save features
    with open(args.foutput, 'w', encoding='utf-8') as writer:
        for sample in data:
            uid = sample['uid']
            tokens = sample['tokens']
            feature = features_dict[uid]
            feature['tokens'] = tokens
            feature['uid'] = uid
            writer.write('{}\n'.format(json.dumps(feature)))
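Example #1 serializes each layer's embedding with str(list) before json.dumps, so reading the dump back requires parsing those strings again. A minimal reader sketch (load_dumped_features is a hypothetical helper name; it assumes layer 11 was among --layers):

import ast
import json

import numpy as np

def load_dumped_features(path, layer=11):
    """Read the JSON-lines feature dump produced above and rebuild numpy arrays."""
    features = {}
    with open(path, encoding='utf-8') as reader:
        for line in reader:
            record = json.loads(line)
            # json.dumps turned the integer layer keys into strings, and each
            # embedding was stored via str(list), hence literal_eval here.
            embedding = np.array(ast.literal_eval(record[str(layer)]))
            features[record['uid']] = {'tokens': record['tokens'],
                                       'embedding': embedding}
    return features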
Example #2
def main():
    logger.info('MT-DNN predicting')
    opt = vars(args)
    batch_size = args.batch_size
    test_path = os.path.join(args.data_dir, args.test_file)
    official_score_file = os.path.join(output_dir, args.ouput_path)
    model_path = args.init_checkpoint
    state_dict = None
    if os.path.exists(model_path):
        state_dict = torch.load(model_path)
        config = state_dict['config']
        opt.update(config)
        #print(state_dict['state'])
        #if state_dict['config']['ema_opt'] > 0:
        #    new_state_dict = {'state': state_dict['state']['ema'], 'config': state_dict['config']}
        #else:
        #    new_state_dict = {'state': state_dict['state']['network'], 'config': state_dict['config']}
        #state_dict = new_state_dict

    model = MTDNNModel(opt, state_dict=state_dict)

    # task type
    prefix = test_path.split('\\')[-1].split('_')[0]
    pw_task = False
    if prefix in opt['pw_tasks']:
        pw_task = True

    test_data = BatchGen(BatchGen.load(test_path, False, pairwise=pw_task),
                         batch_size=batch_size,
                         gpu=args.cuda,
                         is_train=False,
                         task_id=args.task_id,
                         pairwise=pw_task,
                         maxlen=opt['max_seq_len'])
    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    if args.cuda:
        model.cuda()

    prefix = args.test.split('_')[0]  # 'mnli' #
    label_dict = GLOBAL_MAP.get(prefix, None)
    test_metrics, test_predictions, scores, golds, test_ids = eval_model(
        model, test_data, prefix)
    logger.info('test metrics:{}'.format(test_metrics))

    results = {
        'metrics': test_metrics,
        'uids': test_ids,
        'labels': golds,
        'predictions': test_predictions,
        'scores': scores
    }
    submit(official_score_file, results, label_dict)
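Note that the task prefix above is first derived by splitting test_path on '\\', which only works for Windows-style paths. A small portable sketch (task_prefix is a hypothetical helper) using os.path.basename handles whichever separators the OS recognizes:

import os

def task_prefix(path):
    """Derive the task prefix, e.g. 'mnli' from 'data/mnli_test.json'."""
    return os.path.basename(path).split('_')[0]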
Example #3
def main(args):
    # load task info
    task_defs = TaskDefs(args.task_def)
    assert args.task in task_defs.task_type_map
    assert args.task in task_defs.data_type_map
    assert args.task in task_defs.metric_meta_map
    data_type = task_defs.data_type_map[args.task]
    task_type = task_defs.task_type_map[args.task]
    metric_meta = task_defs.metric_meta_map[args.task]

    # load model
    checkpoint_path = args.checkpoint
    assert os.path.exists(checkpoint_path)
    if args.cuda:
        state_dict = torch.load(checkpoint_path)
    else:
        state_dict = torch.load(checkpoint_path, map_location="cpu")
    config = state_dict['config']
    config["cuda"] = args.cuda
    model = MTDNNModel(config, state_dict=state_dict)
    model.load(checkpoint_path)
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)

    # load data
    test_data_set = SingleTaskDataset(args.prep_input,
                                      False,
                                      task_type=task_type,
                                      maxlen=args.max_seq_len)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    test_data = DataLoader(test_data_set,
                           batch_size=args.batch_size_eval,
                           collate_fn=collater.collate_fn,
                           pin_memory=args.cuda)

    with torch.no_grad():
        test_metrics, test_predictions, scores, golds, test_ids = eval_model(
            model,
            test_data,
            metric_meta=metric_meta,
            use_cuda=args.cuda,
            with_label=args.with_label)

        results = {
            'metrics': test_metrics,
            'predictions': test_predictions,
            'uids': test_ids,
            'scores': scores
        }
        dump(args.score, results)
        if args.with_label:
            print(test_metrics)
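The dump helper used to write the results is not shown in these excerpts. A minimal stand-in, assuming it simply serializes the results dict as JSON (an assumption, not necessarily the project's exact implementation):

import json

def dump(path, data):
    """Write the results dictionary to disk as a JSON file."""
    with open(path, 'w', encoding='utf-8') as writer:
        json.dump(data, writer)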
Example #4
def main():
    # Read in the data
    global model

    nwords = len(w2i)
    ntags = len(t2i)
    nchars = len(c2i)

    if 'rnn' in model_type.lower():
        print("Running an RNN model")
        model = RNN()
    elif 'cnn' in model_type.lower():
        print("Running a CNN model")
        model = CNN()
    elif 'bilstm' == model_type.lower():
        print("Running a BiLSTM char + word model")
        model = biLstm_with_chars.BiLSTM()
    elif 'bilstm' in model_type.lower() and 'word' in model_type.lower():
        print("Running a BiLSTM word-only model")
        model = biLstm.BiLSTM()
    elif 'bilstm' in model_type.lower() and 'char' in model_type.lower():
        print("Running a BiLSTM char-only model")
        model = biLstm_char_only.BiLSTM()

    opt = {'log_file': 'checkpoints/scitail_tl_adamax_answer_opt0_gc0_ggc1_7_2_19/log.log', 'init_checkpoint': '/data/kashyap_data/mt_dnn_models/mt_dnn_large_uncased.pt', 'data_dir': 'data/domain_adaptation', 'data_sort_on': False, 'name': 'farmer', 'train_datasets': ['scitail'], 'test_datasets': ['scitail'], 'pw_tasks': ['qnnli'], 'update_bert_opt': 0, 'multi_gpu_on': False, 'mem_cum_type': 'simple', 'answer_num_turn': 5, 'answer_mem_drop_p': 0.1, 'answer_att_hidden_size': 128, 'answer_att_type': 'bilinear', 'answer_rnn_type': 'gru', 'answer_sum_att_type': 'bilinear', 'answer_merge_opt': 1, 'answer_mem_type': 1, 'answer_dropout_p': 0.1, 'answer_weight_norm_on': False, 'dump_state_on': False, 'answer_opt': [
        0], 'label_size': '2', 'mtl_opt': 0, 'ratio': 0, 'mix_opt': 0, 'max_seq_len': 512, 'init_ratio': 1, 'cuda': True, 'log_per_updates': 500, 'epochs': 5, 'batch_size': 16, 'batch_size_eval': 8, 'optimizer': 'adamax', 'grad_clipping': 0.0, 'global_grad_clipping': 1.0, 'weight_decay': 0, 'learning_rate': 5e-05, 'momentum': 0, 'warmup': 0.1, 'warmup_schedule': 'warmup_linear', 'vb_dropout': True, 'dropout_p': 0.1, 'dropout_w': 0.0, 'bert_dropout_p': 0.1, 'ema_opt': 0, 'ema_gamma': 0.995, 'have_lr_scheduler': True, 'multi_step_lr': '10,20,30', 'freeze_layers': -1, 'embedding_opt': 0, 'lr_gamma': 0.5, 'bert_l2norm': 0.0, 'scheduler_type': 'ms', 'output_dir': 'checkpoints/scitail_tl_adamax_answer_opt0_gc0_ggc1_7_2_19', 'seed': 2018, 'task_config_path': 'configs/tasks_config.json', 'tasks_dropout_p': [0.1]}
    state_dict = torch.load(
        "checkpoint/scitail_model_0.pt")
    config = state_dict['config']
    config['attention_probs_dropout_prob'] = 0.1
    config['hidden_dropout_prob'] = 0.1
    opt.update(config)
    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=50)

    print ("building vocabulary...")
    create_vocabulary('data/classes/train.txt')
    print ("done building vocabulary...")
    print ('size of the character vocab %s' %(len(char_vocab_set)))
   # trainer = model.build_model(nwords, nchars, ntags)

    # if input_file != "":
    #     model.load(input_file)

    if 'train' in mode.lower():
        if params['adv_swap'] or params['adv_drop'] or params['adv_key'] \
            or params['adv_add'] or params['adv_all']:
            start_adversarial_training(trainer)
        else:
            start_training(train, dev, trainer)
    elif 'gen' in mode.lower():
        generate_ann()
    elif 'examples' in mode.lower():
        get_qualitative_examples()
    else:
        evaluate()
        if type_of_attack is not None:
            check_against_spell_mistakes('data/classes/test.txt')
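The elif chain above selects a model class by substring-matching model_type. A hedged, behavior-preserving sketch that expresses the same dispatch as an ordered rule table (the constructors are the ones imported by this example; select_model is a hypothetical name):

def select_model(model_type):
    """Pick and instantiate a model from the model_type string; the rule order matters."""
    mt = model_type.lower()
    rules = [
        (lambda s: 'rnn' in s, "Running an RNN model", RNN),
        (lambda s: 'cnn' in s, "Running a CNN model", CNN),
        (lambda s: s == 'bilstm', "Running a BiLSTM char + word model", biLstm_with_chars.BiLSTM),
        (lambda s: 'bilstm' in s and 'word' in s, "Running a BiLSTM word-only model", biLstm.BiLSTM),
        (lambda s: 'bilstm' in s and 'char' in s, "Running a BiLSTM char-only model", biLstm_char_only.BiLSTM),
    ]
    for matches, message, constructor in rules:
        if matches(mt):
            print(message)
            return constructor()
    raise ValueError('Unknown model_type: {}'.format(model_type))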
Example #5
def make_prediction(Description_A, Description_B, model_path, USE_GPU=True):
    # Load the tokenizer from a stored pickle object, as it is more reliable.
    # To create the object yourself: bert_tokenizer = BertTokenizer.from_pretrained("bert-large-uncased", do_lower_case=True)

    pickle_off = open("tokenizer.pkl", "rb")
    tokenizer = pickle.load(pickle_off)

    # Tokenize the words into the format required by BERT, e.g. "I am playing" -> ["I", "am", "play", "##ing"]
    hypothesis = tokenizer.tokenize(Description_A)

    # If the sequence is too long, truncate it so it fits within BERT's max sequence length
    # (leaving room for the special tokens); the tokens are then converted into ids below.
    if len(hypothesis) > 512 - 3:
        hypothesis = hypothesis[:512 - 3]
    input_ids = tokenizer.convert_tokens_to_ids(
        ['[CLS]'] + hypothesis + ['[SEP]'])

    # Segment ids mark which sentence each token belongs to; this hardly matters for a
    # single sentence, but it is important for two-sentence classification.
    type_ids = [0] * (len(hypothesis) + 2)

    # Collect the model inputs into a feature dictionary.
    # uid: example id (not important here); label: ground truth (not used when predicting);
    # token_id: the token ids; type_id: segment id of each token (which sentence it belongs to).
    features = {'uid': 0, 'label': 0,
                'token_id': input_ids, 'type_id': type_ids}

    # Load the data into a BatchGen object, which is what eval_model expects; nothing else needs changing here.
    dev_data = BatchGen([features],
                        batch_size=8,
                        gpu=USE_GPU, is_train=False,
                        task_id=0,
                        maxlen=512,
                        pairwise=False,
                        data_type=0,
                        task_type=0)

    # function to convert token ids back to words
    print(tokenizer.convert_ids_to_tokens([101, 100, 5208, 2024, 17662, 9119, 2096, 3173, 2000, 2175, 14555, 2044, 2074, 5983, 6265, 1012, 102, 100, 2308, 2024, 23581, 2096, 3173, 2000, 2175, 14555, 1012, 102]))

    # Hyperparameters: anything that typically needs changing is exposed as a variable at the top.
    opt = {'init_checkpoint': model_path, 'data_dir': 'data/domain_adaptation', 'data_sort_on': False, 'name': 'farmer', 'train_datasets': ['sst'], 'test_datasets': ['sst'], 'pw_tasks': ['qnnli'], 'update_bert_opt': 0, 'multi_gpu_on': False, 'mem_cum_type': 'simple', 'answer_num_turn': 5, 'answer_mem_drop_p': 0.1, 'answer_att_hidden_size': 128, 'answer_att_type': 'bilinear', 'answer_rnn_type': 'gru', 'answer_sum_att_type': 'bilinear', 'answer_merge_opt': 1, 'answer_mem_type': 1, 'answer_dropout_p': 0.1, 'answer_weight_norm_on': False, 'dump_state_on': False, 'answer_opt': [
        0], 'label_size': '2', 'mtl_opt': 0, 'ratio': 0, 'mix_opt': 0, 'max_seq_len': 512, 'init_ratio': 1, 'cuda': USE_GPU, 'log_per_updates': 500, 'epochs': 5, 'batch_size': 32, 'batch_size_eval': 8, 'optimizer': 'adamax', 'grad_clipping': 0.0, 'global_grad_clipping': 1.0, 'weight_decay': 0, 'learning_rate': 5e-05, 'momentum': 0, 'warmup': 0.1, 'warmup_schedule': 'warmup_linear', 'vb_dropout': True, 'dropout_p': 0.1, 'dropout_w': 0.0, 'bert_dropout_p': 0.1, 'ema_opt': 0, 'ema_gamma': 0.995, 'have_lr_scheduler': True, 'multi_step_lr': '10,20,30', 'freeze_layers': -1, 'embedding_opt': 0, 'lr_gamma': 0.5, 'bert_l2norm': 0.0, 'scheduler_type': 'ms', 'output_dir': 'checkpoints/scitail_tl_adamax_answer_opt0_gc0_ggc1_7_2_19', 'seed': 2018, 'task_config_path': 'configs/tasks_config.json', 'tasks_dropout_p': [0.1]}
    state_dict = torch.load(model_path)
    config = state_dict['config']
    config['attention_probs_dropout_prob'] = 0.1
    config['hidden_dropout_prob'] = 0.1
    opt.update(config)
    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=50)

    # Run the actual prediction. The main outputs are `predictions`, a list of size 1,
    # and `scores`, the model's confidence in each class.
    dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
        model, dev_data, 0, use_cuda=USE_GPU, with_label=False)
    # signature: eval_model(model, data, metric_meta, use_cuda=True, with_label=True)
    return dev_predictions, scores
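A hedged usage sketch for make_prediction. The checkpoint path is hypothetical, a tokenizer.pkl must exist in the working directory, and note that the current implementation only tokenizes Description_A (Description_B is accepted but unused):

predictions, scores = make_prediction(
    Description_A="The company designs and manufactures consumer electronics.",
    Description_B="",                          # unused by the current implementation
    model_path="checkpoints/sst/model_0.pt",   # hypothetical checkpoint path
    USE_GPU=True)
print(predictions, scores)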
Example #6
def main():
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}
    tasks_class = {}
    nclass_list = []
    dropout_list = []

    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in DATA_META
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        nclass = DATA_META[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(tasks_class)

        task_type = TASK_TYPE[prefix]
        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True

        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)

        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)

        dropout_p = args.dropout_p
        if tasks_config and prefix in tasks_config:
            dropout_p = tasks_config[prefix]
        dropout_list.append(dropout_p)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(BatchGen.load(train_path, True, pairwise=pw_task, maxlen=args.max_seq_len, filter_long_parses=(prefix!='mednli')),
                                batch_size=batch_size,
                                dropout_w=args.dropout_w,
                                gpu=args.cuda,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type,
                                use_parse=args.use_parse,
                                use_generic_features=args.use_generic_features,
                                use_domain_features=args.use_domain_features,
                                feature_pkl_dir=args.feature_pkl_dir if args.feature_pkl_dir is not None else args.data_dir,
                                feature_pkl_namespace='train')
        train_data_list.append(train_data)

    opt['tasks_dropout_p'] = dropout_list

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[DATA_META[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = TASK_TYPE[prefix]

        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True

        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path, False, pairwise=pw_task, maxlen=args.max_seq_len, filter_long_parses=False),
                                  batch_size=args.batch_size_eval,
                                  gpu=args.cuda, is_train=False,
                                  task_id=task_id,
                                  maxlen=args.max_seq_len,
                                  pairwise=pw_task,
                                  data_type=data_type,
                                  task_type=task_type,
                                  use_parse=args.use_parse,
                                  use_generic_features=args.use_generic_features,
                                  use_domain_features=args.use_domain_features,
                                  feature_pkl_dir=args.feature_pkl_dir if args.feature_pkl_dir is not None else args.data_dir,
                                  feature_pkl_namespace='dev')
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path, False, pairwise=pw_task, maxlen=args.max_seq_len, filter_long_parses=False),
                                  batch_size=args.batch_size_eval,
                                  gpu=args.cuda, is_train=False,
                                  task_id=task_id,
                                  maxlen=args.max_seq_len,
                                  pairwise=pw_task,
                                  data_type=data_type,
                                  task_type=task_type,
                                  use_parse=args.use_parse,
                                  use_generic_features=args.use_generic_features,
                                  use_domain_features=args.use_domain_features,
                                  feature_pkl_dir=args.feature_pkl_dir if args.feature_pkl_dir is not None else args.data_dir,
                                  feature_pkl_namespace='test')
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    model_path = args.init_checkpoint
    state_dict = None

    pretrained_embeddings = pretrained_idx2token = None
    if os.path.exists(model_path):
        state_dict = torch.load(model_path)
        state_dict.pop('optimizer', None)
        config = state_dict['config']
        config['attention_probs_dropout_prob'] = args.bert_dropout_p
        config['hidden_dropout_prob'] = args.bert_dropout_p
        for k in {'epochs', 'output_dir', 'train_datasets', 'test_datasets', 'seed', 'local_model_idx2token',
                  'use_parse', 'stx_parse_dim', 'glove_path', 'unk_threshold', 'use_generic_features', 'use_domain_features',
                  'feature_dim', 'feature_pkl_dir'}:
            config.pop(k, None)
        opt.update(config)

        if 'treelstm.embedding.weight' in state_dict['state']:
            pretrained_embeddings = state_dict['state']['treelstm.embedding.weight']
            pretrained_idx2token = pickle.load(open(args.local_model_idx2token, 'rb'))
            del state_dict['state']['treelstm.embedding.weight']
    else:
        logger.error('#' * 20)
        logger.error('Could not find the init model!\n The parameters will be initialized randomly!')
        logger.error('#' * 20)
        config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
        opt.update(config)

    assert len(train_data_list) == len(dev_data_list) == len(test_data_list) == 1
    embedding_matrix = token2idx = unked_words = None
    if args.use_parse:
        assert args.stx_parse_dim is not None
        assert args.glove_path is not None
        vocab = Counter()
        for data in train_data_list:
            for batch in data.data:
                for example in batch:
                    for leaf in Tree.from_char_indices(example['parse_id_a']).leaves(): vocab[leaf.content.lower()] += 1
                    for leaf in Tree.from_char_indices(example['parse_id_b']).leaves(): vocab[leaf.content.lower()] += 1
        for data in dev_data_list:
            for batch in data.data:
                for example in batch:
                    for leaf in Tree.from_char_indices(example['parse_id_a']).leaves(): vocab[leaf.content.lower()] += 1
                    for leaf in Tree.from_char_indices(example['parse_id_b']).leaves(): vocab[leaf.content.lower()] += 1

        final_vocab = {'<unk>'}
        unked_words = set()
        for word, count in vocab.items():
            (final_vocab if count >= args.unk_threshold else unked_words).add(word)
        assert len(final_vocab) + len(unked_words) == len(vocab) + 1
        vocab = final_vocab

        idx2token = {}
        for token in vocab:
            idx2token[len(idx2token)] = token
        pickle.dump(idx2token, open(os.path.join(args.output_dir, "idx2token.pkl"), 'wb'))
        token2idx = {token: idx for idx, token in idx2token.items()}
        embedding_matrix = load_embeddings(args.glove_path, vocab, idx2token, pretrained_embeddings=pretrained_embeddings, pretrained_idx2token=pretrained_idx2token)

    num_generic_features = num_domain_features = None
    if args.use_generic_features:
        num_generic_features = len(train_data_list[0].data[0][0]['generic_features'])
    if args.use_domain_features:
        num_domain_features = len(train_data_list[0].data[0][0]['domain_features'])

    all_iters =[iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]
    num_all_batches = args.epochs * sum(all_lens)

    if len(train_data_list)> 1 and args.ratio > 0:
        num_all_batches = int(args.epochs * (len(train_data_list[0]) * (1 + args.ratio)))

    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=num_all_batches,
                       use_parse=args.use_parse, embedding_matrix=embedding_matrix,
                       token2idx=token2idx, stx_parse_dim=args.stx_parse_dim, unked_words=unked_words,
                       use_generic_features=args.use_generic_features, num_generic_features=num_generic_features,
                       use_domain_features=args.use_domain_features, num_domain_features=num_domain_features, feature_dim=args.feature_dim)
    ####model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ###print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    if args.freeze_layers > 0:
        model.network.freeze_layers(args.freeze_layers)

    highest_dev_acc = -1

    if args.cuda:
        model.cuda()
    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        for train_data in train_data_list:
            train_data.reset()
        start = datetime.now()
        all_indices=[]
        if len(train_data_list)> 1 and args.ratio > 0:
            main_indices =[0] * len(train_data_list[0])
            extra_indices=[]
            for i in range(1, len(train_data_list)):
                extra_indices += [i] * len(train_data_list[i])
            random_picks=int(min(len(train_data_list[0]) * args.ratio, len(extra_indices)))
            extra_indices = np.random.choice(extra_indices, random_picks, replace=False)
            if args.mix_opt > 0:
                extra_indices = extra_indices.tolist()
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices.tolist()

        else:
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)

        for i in range(len(all_indices)):
            task_id = all_indices[i]
            batch_meta, batch_data= next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if (model.updates) % args.log_per_updates == 0 or model.updates == 1:
                logger.info('Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'.format(task_id,
                    model.updates, model.train_loss.avg,
                    str((datetime.now() - start) / (i + 1) * (len(all_indices) - i - 1)).split('.')[0]))

        assert len(args.test_datasets) == 1
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = GLOBAL_MAP.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids= eval_model(model, dev_data, dataset=prefix,
                                                                                 use_cuda=args.cuda)
                assert len(dev_metrics) == 1
                for key, val in dev_metrics.items():
                    logger.warning("Task {0} -- epoch {1} -- Dev {2}: {3:.3f}".format(dataset, epoch, key, val))
                score_file = os.path.join(output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {'metrics': dev_metrics, 'predictions': dev_predictions, 'uids': dev_ids, 'scores': scores}
                dump(score_file, results)
                official_score_file = os.path.join(output_dir, '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)

                if list(dev_metrics.values())[0] > highest_dev_acc:
                    model.save(os.path.join(output_dir, 'best_model.pt'))
                    highest_dev_acc = list(dev_metrics.values())[0]
                logger.warning(f'Best dev {highest_dev_acc}')
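The per-epoch batch schedule built above mixes task indices according to ratio and mix_opt. A self-contained sketch of the same scheduling logic (build_batch_schedule is a hypothetical helper name) makes the two regimes easier to follow:

import random

import numpy as np

def build_batch_schedule(batch_counts, ratio=0, mix_opt=0):
    """Return a list of task indices, one per training batch, for one epoch.

    batch_counts[i] is the number of batches for task i; task 0 is the main task.
    With ratio > 0, at most ratio * batch_counts[0] batches come from the other tasks.
    """
    if len(batch_counts) > 1 and ratio > 0:
        main_indices = [0] * batch_counts[0]
        extra_indices = [i for i in range(1, len(batch_counts)) for _ in range(batch_counts[i])]
        random_picks = int(min(batch_counts[0] * ratio, len(extra_indices)))
        extra_indices = list(np.random.choice(extra_indices, random_picks, replace=False))
        if mix_opt > 0:
            random.shuffle(extra_indices)
            all_indices = extra_indices + main_indices
        else:
            all_indices = main_indices + extra_indices
    else:
        all_indices = [i for i in range(1, len(batch_counts)) for _ in range(batch_counts[i])]
        if mix_opt > 0:
            random.shuffle(all_indices)
        all_indices += [0] * batch_counts[0]
    if mix_opt < 1:
        random.shuffle(all_indices)
    return all_indices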
Example #7
def main():
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}
    tasks_class = {}
    nclass_list = []
    decoder_opts = []
    dropout_list = []

    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in DATA_META
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        nclass = DATA_META[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(
                tasks_class)

        task_type = TASK_TYPE[prefix]
        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True

        dopt = generate_decoder_opt(prefix, opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)

        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)

        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)

        dropout_p = args.dropout_p
        if tasks_config and prefix in tasks_config:
            dropout_p = tasks_config[prefix]
        dropout_list.append(dropout_p)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(BatchGen.load(train_path,
                                            True,
                                            pairwise=pw_task,
                                            maxlen=args.max_seq_len),
                              batch_size=batch_size,
                              dropout_w=args.dropout_w,
                              gpu=args.cuda,
                              task_id=task_id,
                              maxlen=args.max_seq_len,
                              pairwise=pw_task,
                              data_type=data_type,
                              task_type=task_type)
        train_data_list.append(train_data)

    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        if args.mtl_opt > 0:
            task_id = tasks_class[DATA_META[prefix]]
        else:
            task_id = tasks[prefix]
        task_type = TASK_TYPE[prefix]

        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True

        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path,
                                              False,
                                              pairwise=pw_task,
                                              maxlen=args.max_seq_len),
                                batch_size=args.batch_size_eval,
                                gpu=args.cuda,
                                is_train=False,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path,
                                               False,
                                               pairwise=pw_task,
                                               maxlen=args.max_seq_len),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda,
                                 is_train=False,
                                 task_id=task_id,
                                 maxlen=args.max_seq_len,
                                 pairwise=pw_task,
                                 data_type=data_type,
                                 task_type=task_type)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    all_iters = [iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]
    num_all_batches = args.epochs * sum(all_lens)

    if len(train_data_list) > 1 and args.ratio > 0:
        num_all_batches = int(args.epochs * (len(train_data_list[0]) *
                                             (1 + args.ratio)))

    model_path = args.init_checkpoint
    state_dict = None

    if os.path.exists(model_path):
        state_dict = torch.load(model_path)
        config = state_dict['config']
        config['attention_probs_dropout_prob'] = args.bert_dropout_p
        config['hidden_dropout_prob'] = args.bert_dropout_p
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error(
            'Could not find the init model!\n The parameters will be initialized randomly!'
        )
        logger.error('#' * 20)
        config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
        opt.update(config)

    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    ####model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ###print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    if args.freeze_layers > 0:
        model.network.freeze_layers(args.freeze_layers)

    if args.cuda:
        model.cuda()
    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        for train_data in train_data_list:
            train_data.reset()
        start = datetime.now()
        all_indices = []
        if len(train_data_list) > 1 and args.ratio > 0:
            main_indices = [0] * len(train_data_list[0])
            extra_indices = []
            for i in range(1, len(train_data_list)):
                extra_indices += [i] * len(train_data_list[i])
            random_picks = int(
                min(len(train_data_list[0]) * args.ratio, len(extra_indices)))
            extra_indices = np.random.choice(extra_indices,
                                             random_picks,
                                             replace=False)
            if args.mix_opt > 0:
                extra_indices = extra_indices.tolist()
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices.tolist()

        else:
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)

        for i in range(len(all_indices)):
            task_id = all_indices[i]
            batch_meta, batch_data = next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if (model.updates
                ) % args.log_per_updates == 0 or model.updates == 1:
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(
                        task_id, model.updates, model.train_loss.avg,
                        str((datetime.now() - start) / (i + 1) *
                            (len(all_indices) - i - 1)).split('.')[0]))

        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = GLOBAL_MAP.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
                    model, dev_data, dataset=prefix, use_cuda=args.cuda)
                for key, val in dev_metrics.items():
                    logger.warning(
                        "Task {0} -- epoch {1} -- Dev {2}: {3:.3f}".format(
                            dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                # For eval_model, with_label=True reports evaluation metrics on the test data.
                # The authors presumably disabled this because it is bad practice during
                # hyperparameter tuning; however, it is the most convenient way to get test scores.
                # To avoid bias, hyperparameter decisions are made from dev metrics, and test
                # metrics are only recorded for the final versions of models.
                test_metrics, test_predictions, scores, golds, test_ids = eval_model(
                    model,
                    test_data,
                    dataset=prefix,
                    use_cuda=args.cuda,
                    with_label=True)
                score_file = os.path.join(
                    output_dir,
                    '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_test_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)
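Each epoch above dumps a {dataset}_dev_scores_{epoch}.json file. A hedged post-hoc sketch for picking the best epoch from those files, assuming dump() wrote plain JSON (see the stand-in after Example #3) and that the first value in 'metrics' is the quantity to maximize:

import glob
import json
import os

def best_epoch_from_dev_scores(output_dir, dataset):
    """Scan the per-epoch dev score dumps and return (best_epoch, best_metric_value)."""
    best_epoch, best_value = -1, float('-inf')
    pattern = os.path.join(output_dir, '{}_dev_scores_*.json'.format(dataset))
    for path in glob.glob(pattern):
        epoch = int(os.path.splitext(path)[0].rsplit('_', 1)[-1])
        with open(path, encoding='utf-8') as reader:
            metrics = json.load(reader)['metrics']
        value = list(metrics.values())[0]
        if value > best_value:
            best_epoch, best_value = epoch, value
    return best_epoch, best_value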
Example #8
        # (truncated excerpt) the closing lines of a bert_feature_extractor-style helper:
        # build the BERT input ids and segment ids for a single sentence.
        input_ids = tokenize_fn.convert_tokens_to_ids(['[CLS]'] + tokens_a +
                                                      ['[SEP]'])
        segment_ids = [0] * len(input_ids)
    input_mask = None
    return input_ids, input_mask, segment_ids


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)
print('Enter Sentence 1:')
premise = input()
print('Enter Sentence 2:')
hypothesis = input()

input_ids, _, type_ids = bert_feature_extractor(premise,
                                                hypothesis,
                                                max_seq_length=64,
                                                tokenize_fn=tokenizer)
features = {
    'uid': '0',
    'label': '0',
    'token_id': input_ids,
    'type_id': type_ids
}

model_path = 'checkpoints/my_mnli/model_0.pt'
state_dict = torch.load(model_path)
config = state_dict['config']
opt.update(config)
model = MTDNNModel(opt, state_dict=state_dict)
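The snippet stops after building the model. A hedged continuation, following the BatchGen and eval_model pattern from Example #5, would batch the single feature dict and run the prediction (the task_id, data_type and task_type values below are assumptions for a single-task classification checkpoint):

dev_data = BatchGen([features],
                    batch_size=1,
                    gpu=True,         # assumed: CUDA available
                    is_train=False,
                    task_id=0,        # assumed: single-task checkpoint
                    maxlen=64,
                    pairwise=False,
                    data_type=0,      # assumed data/task type codes
                    task_type=0)
_, predictions, scores, _, _ = eval_model(model, dev_data, 0, use_cuda=True, with_label=False)
print(predictions, scores)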
Example #9
def main():
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}
    tasks_class = {}
    nclass_list = []
    decoder_opts = []
    dropout_list = []

    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in DATA_META
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        nclass = DATA_META[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(tasks_class)

        task_type = TASK_TYPE[prefix]
        pw_task = False

        dopt = generate_decoder_opt(prefix, opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)

        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)

        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)

        dropout_p = args.dropout_p
        if tasks_config and prefix in tasks_config:
            dropout_p = tasks_config[prefix]
        dropout_list.append(dropout_p)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(BatchGen.load(train_path, True, pairwise=pw_task, maxlen=args.max_seq_len, 
                                        opt=opt, dataset=dataset),
                                batch_size=batch_size,
                                dropout_w=args.dropout_w,
                                gpu=args.cuda,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type,
                                dataset_name=dataset)
        train_data.reset()
        train_data_list.append(train_data)

    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[DATA_META[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = TASK_TYPE[prefix]

        pw_task = False

        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]

        if args.predict_split is not None:
            dev_path = os.path.join(data_dir, '{}_{}.json'.format(dataset, 
                args.predict_split))
        else:
            dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path, False, pairwise=pw_task, maxlen=args.max_seq_len,
                                            opt=opt, dataset=dataset),
                                  batch_size=args.batch_size_eval,
                                  gpu=args.cuda, is_train=False,
                                  task_id=task_id,
                                  maxlen=args.max_seq_len,
                                  pairwise=pw_task,
                                  data_type=data_type,
                                  task_type=task_type,
                                  dataset_name=dataset)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path, False, pairwise=pw_task, 
                                            maxlen=args.max_seq_len,opt=opt, dataset=dataset),
                                  batch_size=args.batch_size_eval,
                                  gpu=args.cuda, is_train=False,
                                  task_id=task_id,
                                  maxlen=args.max_seq_len,
                                  pairwise=pw_task,
                                  data_type=data_type,
                                  task_type=task_type,
                                  dataset_name=dataset)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    all_iters =[iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]
    num_all_batches = args.epochs * sum(all_lens)

    if len(args.external_datasets) > 0 and args.external_include_ratio > 0:
        num_in_domain_batches = args.epochs* sum(all_lens[:-len(args.external_datasets)])
        num_all_batches = num_in_domain_batches * (1 + args.external_include_ratio)
    # pdb.set_trace()

    model_path = args.init_checkpoint
    state_dict = None

    if os.path.exists(model_path):
        state_dict = torch.load(model_path)
        if args.init_config is not None: # load huggingface model
            config = json.load(open(args.init_config))
            state_dict={'config':config, 'state':state_dict}
        if args.finetune:
            # only resume config and state
            del_keys=set(state_dict.keys())-set(['config','state'])
            for key in del_keys:
                del state_dict[key]
            resume_configs=json.load(open('config/resume_configs.json'))
            del_keys=set(state_dict['config'].keys())-set(resume_configs)
            for key in del_keys:
                del state_dict['config'][key]
            if args.resume_scoring is not None:                    
                for key in state_dict['state'].keys():
                    if 'scoring_list.0' in key:
                        state_dict['state'][key]=state_dict['state'][key.replace('0',str(args.resume_scoring))]
                        # other scorings will be deleted during loading process, since finetune only has one task
            elif not args.retain_scoring:
                del_keys = [k for k in state_dict['state'] if 'scoring_list' in k]
                for key in del_keys:                    
                    print('deleted previous weight:',key)
                    del state_dict['state'][key]
        config = state_dict['config']
        config['attention_probs_dropout_prob'] = args.bert_dropout_p
        config['hidden_dropout_prob'] = args.bert_dropout_p
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error('Could not find the init model!\n The parameters will be initialized randomly!')
        logger.error('#' * 20)
        config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
        opt.update(config)

    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=num_all_batches)
    ####model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ###print network
    # logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    if args.freeze_layers > 0:
        model.network.freeze_layers(args.freeze_layers)

    if args.cuda:
        model.cuda()
    best_epoch=-1
    best_performance=0 
    best_dataset_performance={dataset:{'perf':0,'epoch':-1} for dataset in args.mtl_observe_datasets}
    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        if epoch==0 and args.freeze_bert_first:
            model.network.freeze_bert()
            logger.warning('Bert freezed.')
        if epoch==1 and args.freeze_bert_first:
            model.network.unfreeze_bert()
            logger.warning('Bert unfreezed.')
        start = datetime.now()
        all_indices=[]
        if len(args.external_datasets)> 0 and args.external_include_ratio>0:
            main_indices = []
            extra_indices = []
            for data_idx,batcher in enumerate(train_data_list):
                if batcher.dataset_name not in args.external_datasets:
                    main_indices += [data_idx] * len(batcher)
                else:
                    extra_indices += [data_idx] * len(batcher)

            random_picks=int(min(len(main_indices) * args.external_include_ratio, len(extra_indices)))
            extra_indices = np.random.choice(extra_indices, random_picks, replace=False)
            if args.mix_opt > 0:
                extra_indices = extra_indices.tolist()
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices.tolist()
        else:
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)
        if args.test_mode:
            all_indices=all_indices[:2]
        if args.predict_split is not None:
            all_indices=[]
            dev_split=args.predict_split
        else:
            dev_split='dev'

        for i in range(len(all_indices)):
            task_id = all_indices[i]
            batch_meta, batch_data= next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if (model.updates) % args.log_per_updates == 0 or model.updates == 1:
                logger.info('Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'.format(task_id,
                    model.updates, model.train_loss.avg,
                    str((datetime.now() - start) / (i + 1) * (len(all_indices) - i - 1)).split('.')[0]))
        os.system('nvidia-smi')
        for train_data in train_data_list:
            train_data.reset()        

        this_performance={}

        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids= eval_model(model, dev_data, dataset=prefix,
                                                                                 use_cuda=args.cuda)
                score_file = os.path.join(output_dir, '{}_{}_scores_{}.json'.format(dataset, dev_split, epoch))
                results = {'metrics': dev_metrics, 'predictions': dev_predictions, 'uids': dev_ids, 'scores': scores}
                dump(score_file, results)
                official_score_file = os.path.join(output_dir, '{}_{}_scores_{}.csv'.format(dataset, dev_split, epoch))
                submit(official_score_file, results,dataset_name=prefix, threshold=2.0+args.mediqa_score_offset)
                if prefix in mediqa_name_list:
                    logger.warning('self test numbers:{}'.format(dev_metrics))
                    if '_' in dataset:
                        affix = dataset.split('_')[1]
                        ground_truth_path=os.path.join(args.data_root,'mediqa/task3_qa/gt_{}_{}.csv'.format(dev_split,affix))
                    else:
                        ground_truth_path=os.path.join(args.data_root,'mediqa/task3_qa/gt_{}.csv'.format(dev_split))
                    official_result=eval_mediqa_official(pred_path=official_score_file, ground_truth_path=ground_truth_path, 
                        eval_qa_more=args.mediqa_eval_more)
                    logger.warning("MediQA dev eval result:{}".format(official_result))
                    if args.mediqa_eval_more:
                        dev_metrics={'ACC':official_result['score']*100,'Spearman':official_result['score_secondary']*100,
                                    'F1':dev_metrics['F1'], 'MRR':official_result['meta']['MRR'], 'MAP':official_result['MAP'],
                                    'P@1':official_result['meta']['P@1']}
                    else:
                        dev_metrics={'ACC':official_result['score']*100,'Spearman':official_result['score_secondary']*100}

                for key, val in dev_metrics.items():
                    logger.warning("Task {0} -- epoch {1} -- Dev {2}: {3:.3f}".format(dataset, epoch, key, val))
            if args.predict_split is not None:
                continue
            print('args.mtl_observe_datasets:',args.mtl_observe_datasets, dataset)
            if dataset in args.mtl_observe_datasets:
                this_performance[dataset]=np.mean([val for val in dev_metrics.values()])
            test_data = test_data_list[idx]
            if test_data is not None:
                test_metrics, test_predictions, scores, golds, test_ids= eval_model(model, test_data, dataset=prefix,
                                                                                 use_cuda=args.cuda, with_label=False)
                for key, val in test_metrics.items():
                    logger.warning("Task {0} -- epoch {1} -- Test {2}: {3:.3f}".format(dataset, epoch, key, val))
                score_file = os.path.join(output_dir, '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {'metrics': test_metrics, 'predictions': test_predictions, 'uids': test_ids, 'scores': scores}
                dump(score_file, results)
                # if dataset in mediqa_name_list:
                official_score_file = os.path.join(output_dir, '{}_test_scores_{}.csv'.format(dataset, epoch))
                submit(official_score_file, results,dataset_name=prefix, threshold=2.0+args.mediqa_score_offset)
                logger.info('[new test scores saved.]')
        print('this_performance:',this_performance)
        if args.predict_split is not None:
            break
        epoch_performance = sum([val for val in this_performance.values()])
        if epoch_performance>best_performance:
            print('changed:',epoch_performance,best_performance)
            best_performance=epoch_performance
            best_epoch=epoch

        for dataset in args.mtl_observe_datasets:
            if best_dataset_performance[dataset]['perf']<this_performance[dataset]:
                best_dataset_performance[dataset]={'perf':this_performance[dataset],
                                                   'epoch':epoch} 


        print('current best:',best_performance,'at epoch', best_epoch)
        if not args.not_save_model:
            model_name = 'model_last.pt' if args.save_last else 'model_{}.pt'.format(epoch) 
            model_file = os.path.join(output_dir, model_name)
            if args.save_last and os.path.exists(model_file):
                model_temp=os.path.join(output_dir, 'model_secondlast.pt')
                copyfile(model_file, model_temp)
            model.save(model_file)
            if args.save_best and best_epoch==epoch:
                best_path = os.path.join(output_dir,'best_model.pt')
                copyfile(model_file,best_path)
                for dataset in args.mtl_observe_datasets:
                    if best_dataset_performance[dataset]['epoch']==epoch:
                        best_path = os.path.join(output_dir,'best_model_{}.pt'.format(dataset))
                        copyfile(model_file,best_path)
Example #10
def main():
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size

    tasks = {}
    task_def_list = []
    dropout_list = []

    train_datasets = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks:
            continue
        task_id = len(tasks)
        tasks[prefix] = task_id
        task_def = task_defs.get_task_def(prefix)
        task_def_list.append(task_def)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data_set = SingleTaskDataset(train_path,
                                           True,
                                           maxlen=args.max_seq_len,
                                           task_id=task_id,
                                           task_def=task_def)
        train_datasets.append(train_data_set)
    train_collater = Collater(dropout_w=args.dropout_w,
                              encoder_type=encoder_type,
                              soft_label=args.mkd_opt > 0)
    multi_task_train_dataset = MultiTaskDataset(train_datasets)
    multi_task_batch_sampler = MultiTaskBatchSampler(train_datasets,
                                                     args.batch_size,
                                                     args.mix_opt, args.ratio)
    multi_task_train_data = DataLoader(multi_task_train_dataset,
                                       batch_sampler=multi_task_batch_sampler,
                                       collate_fn=train_collater.collate_fn,
                                       pin_memory=args.cuda)

    opt['task_def_list'] = task_def_list

    dev_data_list = []
    test_data_list = []
    test_collater = Collater(is_train=False, encoder_type=encoder_type)
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_def = task_defs.get_task_def(prefix)
        task_id = tasks[prefix]
        task_type = task_def.task_type
        data_type = task_def.data_type

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data_set = SingleTaskDataset(dev_path,
                                             False,
                                             maxlen=args.max_seq_len,
                                             task_id=task_id,
                                             task_def=task_def)
            dev_data = DataLoader(dev_data_set,
                                  batch_size=args.batch_size_eval,
                                  collate_fn=test_collater.collate_fn,
                                  pin_memory=args.cuda)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data_set = SingleTaskDataset(test_path,
                                              False,
                                              maxlen=args.max_seq_len,
                                              task_id=task_id,
                                              task_def=task_def)
            test_data = DataLoader(test_data_set,
                                   batch_size=args.batch_size_eval,
                                   collate_fn=test_collater.collate_fn,
                                   pin_memory=args.cuda)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    # divide by the number of gradient accumulation steps.
    num_all_batches = args.epochs * len(
        multi_task_train_data) // args.grad_accumulation_step
    logger.info('############# Gradient Accumulation Info #############')
    logger.info('number of steps: {}'.format(args.epochs *
                                             len(multi_task_train_data)))
    logger.info('number of grad accumulation steps: {}'.format(
        args.grad_accumulation_step))
    logger.info('adjusted number of steps: {}'.format(num_all_batches))
    logger.info('############# Gradient Accumulation Info #############')

    init_model = args.init_checkpoint
    state_dict = None

    if os.path.exists(init_model):
        state_dict = torch.load(init_model)
        config = state_dict['config']
    else:
        if opt['encoder_type'] not in EncoderModelType._value2member_map_:
            raise ValueError("encoder_type is out of pre-defined types")
        literal_encoder_type = EncoderModelType(
            opt['encoder_type']).name.lower()
        config_class, model_class, tokenizer_class = MODEL_CLASSES[
            literal_encoder_type]
        config = config_class.from_pretrained(
            init_model, output_hidden_states=True).to_dict(
            )  # change here to enable multi-layer output

    config['output_hidden_states'] = True
    config['attention_probs_dropout_prob'] = args.bert_dropout_p
    config['hidden_dropout_prob'] = args.bert_dropout_p
    config['multi_gpu_on'] = opt["multi_gpu_on"]
    if args.num_hidden_layers != -1:
        config['num_hidden_layers'] = args.num_hidden_layers
    opt.update(config)
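    # opt now combines the command-line arguments with the (pretrained or checkpoint) encoder
    # config, so MTDNNModel below is built with a matching transformer architecture.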

    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    # tensorboard
    if args.tensorboard:
        args.tensorboard_logdir = os.path.join(args.output_dir,
                                               args.tensorboard_logdir)
        tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)

    if args.encode_mode:
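        # encode_mode: skip training entirely and only dump encoder representations per test set.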
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            test_data = test_data_list[idx]
            with torch.no_grad():
                encoding = extract_encoding(model,
                                            test_data,
                                            use_cuda=args.cuda)
            torch.save(
                encoding,
                os.path.join(output_dir, '{}_encoding.pt'.format(dataset)))
        return

    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        start = datetime.now()

        for i, (batch_meta, batch_data) in enumerate(multi_task_train_data):
            batch_meta, batch_data = Collater.patch_data(
                args.cuda, batch_meta, batch_data)
            task_id = batch_meta['task_id']
            model.update(batch_meta, batch_data)
            if (model.local_updates) % (args.log_per_updates *
                                        args.grad_accumulation_step
                                        ) == 0 or model.local_updates == 1:
                remaining_time = str(
                    (datetime.now() - start) / (i + 1) *
                    (len(multi_task_train_data) - i - 1)).split('.')[0]
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            remaining_time))
                if args.tensorboard:
                    tensorboard.add_scalar('train/loss',
                                           model.train_loss.avg,
                                           global_step=model.updates)

            if args.save_per_updates_on and (
                (model.local_updates) %
                (args.save_per_updates * args.grad_accumulation_step) == 0):
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                logger.info('Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)

        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            task_def = task_defs.get_task_def(prefix)
            label_dict = task_def.label_vocab
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                with torch.no_grad():
                    dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
                        model,
                        dev_data,
                        metric_meta=task_def.metric_meta,
                        use_cuda=args.cuda,
                        label_mapper=label_dict,
                        task_type=task_def.task_type)
                for key, val in dev_metrics.items():
                    if args.tensorboard:
                        tensorboard.add_scalar('dev/{}/{}'.format(
                            dataset, key),
                                               val,
                                               global_step=epoch)
                    if isinstance(val, str):
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}:\n {3}'.format(
                                dataset, epoch, key, val))
                    else:
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}: {3:.3f}'.format(
                                dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                with torch.no_grad():
                    test_metrics, test_predictions, scores, golds, test_ids = eval_model(
                        model,
                        test_data,
                        metric_meta=task_def.metric_meta,
                        use_cuda=args.cuda,
                        with_label=False,
                        label_mapper=label_dict,
                        task_type=task_def.task_type)
                score_file = os.path.join(
                    output_dir,
                    '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_test_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)
    if args.tensorboard:
        tensorboard.close()
Example #11
0
def main():
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size

    # tensorboard
    tensorboard = None
    if args.tensorboard:
        args.tensorboard_logdir = os.path.join(args.output_dir,
                                               args.tensorboard_logdir)
        tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)

    json_logfile = os.path.join(args.output_dir, "runtime_log.json")

    tasks = {}
    tasks_class = {}
    nclass_list = []
    decoder_opts = []
    task_types = []
    dropout_list = []
    loss_types = []
    kd_loss_types = []

    train_datasets = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in task_defs.n_class_map
        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]
        nclass = task_defs.n_class_map[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(
                tasks_class)

        task_type = task_defs.task_type_map[prefix]

        dopt = generate_decoder_opt(task_defs.enable_san_map[prefix],
                                    opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)
        task_types.append(task_type)
        loss_types.append(task_defs.loss_map[prefix])
        kd_loss_types.append(task_defs.kd_loss_map[prefix])

        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)

        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)

        dropout_p = task_defs.dropout_p_map.get(prefix, args.dropout_p)
        dropout_list.append(dropout_p)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data_set = SingleTaskDataset(train_path,
                                           True,
                                           maxlen=args.max_seq_len,
                                           task_id=task_id,
                                           task_type=task_type,
                                           data_type=data_type)
        train_datasets.append(train_data_set)
    train_collater = Collater(dropout_w=args.dropout_w,
                              encoder_type=encoder_type)
    multi_task_train_dataset = MultiTaskDataset(train_datasets)
    # MTSampler = SAMPLERS[args.sampler]
    n_tasks = len(tasks)
    dataset_sizes = [len(dataset) for dataset in train_datasets]
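    # Pick a task-scheduling controller: the "random" controllers only need dataset statistics,
    # while the alternative additionally takes phi / concurrent_cnt / max_queue_cnt queue
    # parameters (the exact scheduling policy depends on the CONTROLLERS implementation).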
    if "random" in args.controller:
        controller = CONTROLLERS[args.controller](
            n_task=n_tasks,
            dataset_names=args.train_datasets,
            dataset_sizes=dataset_sizes,
            batch_size=args.batch_size,
            rebatch_size=args.batch_size_train,
            tensorboard=tensorboard,
            log_filename=json_logfile)
    else:
        controller = CONTROLLERS[args.controller](
            n_task=n_tasks,
            phi=args.phi,
            K=args.concurrent_cnt,
            dataset_names=args.train_datasets,
            dataset_sizes=dataset_sizes,
            max_cnt=args.max_queue_cnt,
            batch_size=args.batch_size,
            rebatch_size=args.batch_size_train,
            tensorboard=tensorboard,
            log_filename=json_logfile)

    multi_task_batch_sampler = ACLSampler(train_datasets,
                                          args.batch_size,
                                          controller=controller)
    # controller.max_step = len(multi_task_batch_sampler)
    multi_task_train_data = DataLoader(multi_task_train_dataset,
                                       batch_sampler=multi_task_batch_sampler,
                                       collate_fn=train_collater.collate_fn,
                                       pin_memory=args.cuda)

    opt['answer_opt'] = decoder_opts
    opt['task_types'] = task_types
    opt['tasks_dropout_p'] = dropout_list
    opt['loss_types'] = loss_types
    opt['kd_loss_types'] = kd_loss_types

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    test_collater = Collater(is_train=False, encoder_type=encoder_type)
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[
            task_defs.
            n_class_map[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = task_defs.task_type_map[prefix]

        pw_task = False
        if task_type == TaskType.Ranking:
            pw_task = True

        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data_set = SingleTaskDataset(dev_path,
                                             False,
                                             maxlen=args.max_seq_len,
                                             task_id=task_id,
                                             task_type=task_type,
                                             data_type=data_type)
            dev_data = DataLoader(dev_data_set,
                                  batch_size=args.batch_size_eval,
                                  collate_fn=test_collater.collate_fn,
                                  pin_memory=args.cuda)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data_set = SingleTaskDataset(test_path,
                                              False,
                                              maxlen=args.max_seq_len,
                                              task_id=task_id,
                                              task_type=task_type,
                                              data_type=data_type)
            test_data = DataLoader(test_data_set,
                                   batch_size=args.batch_size_eval,
                                   collate_fn=test_collater.collate_fn,
                                   pin_memory=args.cuda)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    # divide by the number of gradient accumulation steps.
    num_all_batches = args.epochs * len(
        multi_task_train_data) // args.grad_accumulation_step
    logger.info('############# Gradient Accumulation Info #############')
    logger.info('number of steps: {}'.format(args.epochs *
                                             len(multi_task_train_data)))
    logger.info('number of grad accumulation steps: {}'.format(
        args.grad_accumulation_step))
    logger.info('adjusted number of steps: {}'.format(num_all_batches))
    logger.info('############# Gradient Accumulation Info #############')

    bert_model_path = args.init_checkpoint
    state_dict = None

    if encoder_type == EncoderModelType.BERT:
        if os.path.exists(bert_model_path):
            state_dict = torch.load(bert_model_path)
            config = state_dict['config']
            config['attention_probs_dropout_prob'] = args.bert_dropout_p
            config['hidden_dropout_prob'] = args.bert_dropout_p
            config['multi_gpu_on'] = opt["multi_gpu_on"]
            opt.update(config)
        else:
            logger.error('#' * 20)
            logger.error(
                'Could not find the init model!\n The parameters will be initialized randomly!'
            )
            logger.error('#' * 20)
            config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
            config['multi_gpu_on'] = opt["multi_gpu_on"]
            opt.update(config)
    elif encoder_type == EncoderModelType.ROBERTA:
        bert_model_path = '{}/model.pt'.format(bert_model_path)
        if os.path.exists(bert_model_path):
            new_state_dict = {}
            state_dict = torch.load(bert_model_path)
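            # remap fairseq RoBERTa parameter names onto the 'bert.model.*' namespace used by
            # MT-DNN before handing the weights to MTDNNModel.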
            for key, val in state_dict['model'].items():
                if key.startswith('decoder.sentence_encoder'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
                elif key.startswith('classification_heads'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
            state_dict = {'state': new_state_dict}

    # add score history
    score_history = [[] for _ in range(len(args.test_datasets))]
    total_scores = []

    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    for epoch in range(0, args.epochs):
        logger.warning('At epoch {0}/{1}'.format(epoch + 1, args.epochs))
        start = datetime.now()
        total_len = len(controller)
        controller.set_epoch(epoch)
        for i, (batch_meta, batch_data) in enumerate(multi_task_train_data):
            batch_meta, batch_data = Collater.patch_data(
                args.cuda, batch_meta, batch_data)
            task_id = batch_meta['task_id']
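            # Unlike the standard loop, the loss is not applied immediately: the batch and its
            # current loss are queued in the controller, and controller.step() below decides
            # which queued batches actually update the model (policy depends on the controller).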
            loss = model.calculate_loss(batch_meta, batch_data)
            controller.insert(task_id, (batch_meta, batch_data), loss.item())

            if i % args.log_per_updates == 0:
                remaining_time = str(
                    (datetime.now() - start) / (controller.cur_step + 1) *
                    (total_len - controller.cur_step - 1)).split('.')[0]
                logger.info("Epoch {0} Progress {1} / {2}  ({3:.2%})".format(
                    epoch + 1, controller.cur_step, total_len,
                    controller.cur_step * 1.0 / total_len))
                # logger.info("Progress {0} / {1}  ({2:.2f}%)".format(i, total_len, i*100.0/total_len))
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            remaining_time))

                summary_str = controller.summary()
                for line in summary_str.split("\n"):
                    logger.info(line)

                # avg_loss, out_loss, loss_change, min_loss, min_out_loss = controller.get_loss()
                # logger.info('List of loss {}'.format(",".join(avg_loss)))
                # logger.info('List of out_loss {}'.format(",".join(out_loss)))
                # logger.info('List of loss_change {}'.format(",".join(loss_change)))
                # logger.info('List of min_loss {}'.format(",".join(min_loss)))
                # logger.info('List of min_out_loss {}'.format(",".join(min_out_loss)))
                # chosen = [ "%s:%.3f "%(k,v) for k, v in controller.scaled_dict.items()]
                # logger.info('List of Scaled Choosen time {}'.format(",".join(chosen)))

                if args.tensorboard:
                    tensorboard.add_scalar('train/loss',
                                           model.train_loss.avg,
                                           global_step=model.updates)

            controller.step(model=model)

            if args.save_per_updates_on and (
                (model.local_updates) %
                (args.save_per_updates * args.grad_accumulation_step) == 0):
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                logger.info('Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)

        total_average_score = 0.0
        scoring_cnt = 0
        score_dict = dict()
        scoring_datasets = "cola,sst,mrpc,stsb,qqp,mnli,qnli,rte,wnli".split(
            ",")
        logger.info('Start Testing')
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = task_defs.global_map.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                with torch.no_grad():
                    dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
                        model,
                        dev_data,
                        metric_meta=task_defs.metric_meta_map[prefix],
                        use_cuda=args.cuda,
                        label_mapper=label_dict,
                        task_type=task_defs.task_type_map[prefix])
                task_score = 0.0
                for key, val in dev_metrics.items():
                    if args.tensorboard:
                        tensorboard.add_scalar('dev/{}/{}'.format(
                            dataset, key),
                                               val,
                                               global_step=epoch)
                    if isinstance(val, str):
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}:\n {3}'.format(
                                dataset, epoch + 1, key, val))
                    else:
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}: {3:.2f}'.format(
                                dataset, epoch + 1, key, val))
                    task_score += val
                if len(dev_metrics) > 1:
                    task_score /= len(dev_metrics)
                    logger.warning(
                        'Task {0} -- epoch {1} -- Dev {2}: {3:.2f}'.format(
                            dataset, epoch + 1, "Average", task_score))
                if prefix in scoring_datasets:
                    scoring_cnt += 1
                    if prefix not in score_dict:
                        score_dict[prefix] = task_score
                    else:
                        score_dict[prefix] = (score_dict[prefix] +
                                              task_score) / 2
                    total_average_score += task_score

                score_history[idx].append("%.2f" % task_score)
                logger.warning('Task {0} -- epoch {1} -- Dev {2}: {3}'.format(
                    dataset, epoch + 1, "History", score_history[idx]))

                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                with torch.no_grad():
                    test_metrics, test_predictions, scores, golds, test_ids = eval_model(
                        model,
                        test_data,
                        metric_meta=task_defs.metric_meta_map[prefix],
                        use_cuda=args.cuda,
                        with_label=False,
                        label_mapper=label_dict,
                        task_type=task_defs.task_type_map[prefix])
                score_file = os.path.join(
                    output_dir,
                    '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_test_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')
        scored_task_cnt = len(score_dict)
        if scored_task_cnt > 0:
            mean_value = np.mean([v for k, v in score_dict.items()])
            logger.warning(
                'Epoch {0} -- Dev {1} Tasks, Average Score : {2:.3f}'.format(
                    epoch + 1, scoring_cnt, mean_value))
            score_dict['avg'] = mean_value
            total_scores.append(score_dict)

        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)
    for i, total_score in enumerate(total_scores):
        logger.info(total_score)

    if args.tensorboard:
        tensorboard.close()
Example #12
0
    device = torch.device("cpu")

state_dict = torch.load(checkpoint_path, map_location=device)

config = state_dict["config"]
config["cuda"] = args.cuda
task_def = task_defs.get_task_def(prefix)
task_def_list = [task_def]
config["task_def_list"] = task_def_list
## temp fix
config["fp16"] = False
config["answer_opt"] = 0
config["adv_train"] = False
del state_dict["optimizer"]

model = MTDNNModel(config, device=device, state_dict=state_dict)
encoder_type = config.get("encoder_type", EncoderModelType.BERT)
# load data
test_data_set = SingleTaskDataset(
    args.prep_input,
    False,
    maxlen=args.max_seq_len,
    task_id=args.task_id,
    task_def=task_def,
)
collater = Collater(is_train=False, encoder_type=encoder_type)
test_data = DataLoader(
    test_data_set,
    batch_size=args.batch_size_eval,
    collate_fn=collater.collate_fn,
    pin_memory=args.cuda,
)
Example #13
0
def load_model_for_viz_0(task_def_path,
                         checkpoint_path,
                         input_path,
                         model_type='bert-base-cased',
                         do_lower_case=False,
                         use_cuda=True):
    # load task info
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    assert task in task_defs._task_type_map
    assert task in task_defs._data_type_map
    assert task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    data_type = task_defs._data_type_map[task]
    task_type = task_defs._task_type_map[task]
    metric_meta = task_defs._metric_meta_map[task]
    # load model
    assert os.path.exists(checkpoint_path)
    state_dict = torch.load(checkpoint_path)
    config = state_dict['config']
    config["cuda"] = use_cuda
    task_def = task_defs.get_task_def(prefix)
    task_def_list = [task_def]
    config['task_def_list'] = task_def_list
    ####### temp fix #######
    config['fp16'] = False
    config['answer_opt'] = 0
    config['adv_train'] = False
    del state_dict['optimizer']
    #########################
    model = MTDNNModel(config, state_dict=state_dict)
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)
    root = os.path.basename(task_def_path)
    literal_model_type = model_type.split('-')[0].upper()
    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if 'base' in model_type:
        mt_dnn_suffix += "_base"
    elif 'large' in model_type:
        mt_dnn_suffix += "_large"
    # load tokenizer
    config_class, model_class, tokenizer_class = MODEL_CLASSES[
        literal_model_type]
    tokenizer = tokenizer_class.from_pretrained(model_type,
                                                do_lower_case=do_lower_case)
    # load data
    prep_input = input_path
    test_data_set = SingleTaskDataset(prep_input,
                                      False,
                                      maxlen=512,
                                      task_id=0,
                                      task_def=task_def)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    test_data = DataLoader(test_data_set,
                           batch_size=1,
                           collate_fn=collater.collate_fn,
                           pin_memory=True)
    idx = 0
    results = []
    return model.mnetwork.module.bert, config, test_data
Example #14
0
def main():
    parser = argparse.ArgumentParser()
    #   Required parameters
    parser.add_argument("--task_def",
                        type=str,
                        required=True,
                        default="experiments/glue/glue_task_def.yml")
    parser.add_argument("--task", type=str, required=True)
    parser.add_argument("--task_id",
                        type=int,
                        default=0,
                        help="the id of this task when training")
    parser.add_argument("--checkpoint",
                        default='mt_dnn_models/bert_model_base_uncased.pt',
                        type=str)
    parser.add_argument(
        "--output_dir",
        default=
        '/content/gdrive/My Drive/Colab Notebooks/cs99/mt-dnn/checkpoints/bert-cased_lcp-single_2020-12-23T2029/',
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--prep_input",
        default=
        '/content/gdrive/My Drive/Colab Notebooks/cs99/mt-dnn/data_complex/bert_base_cased/lcp_dev.json',
        type=str,
        required=True,
    )
    parser.add_argument(
        '--bert_model_type',
        default='bert-base-cased',
        type=str,
        help="What type of bert model should we be using",
    )

    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help=
        "Pretrained config name or path if not the same as model_name_or_path",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help=
        "Pretrained tokenizer name or path if not the same as model_name_or_path",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from huggingface.co",
    )
    parser.add_argument(
        "--data_subset",
        type=int,
        default=-1,
        help="If > 0: limit the data to a subset of data_subset instances.")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Whether to overwrite data in output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")

    parser.add_argument("--dont_normalize_importance_by_layer",
                        action="store_true",
                        help="Don't normalize importance score by layers")
    parser.add_argument(
        "--dont_normalize_global_importance",
        action="store_true",
        help="Don't normalize all importance scores between 0 and 1",
    )

    parser.add_argument(
        "--try_masking",
        action="store_true",
        help="Whether to try to mask head until a threshold of accuracy.")
    parser.add_argument(
        "--masking_threshold",
        default=0.9,
        type=float,
        help=
        "masking threshold in term of metrics (stop masking when metric < threshold * original metric value).",
    )
    parser.add_argument(
        "--masking_amount",
        default=0.1,
        type=float,
        help="Amount to heads to masking at each masking step.")
    parser.add_argument("--metric_name",
                        default="acc",
                        type=str,
                        help="Metric to use for head masking.")

    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, sequences shorter padded.",
    )
    # temp fix: technically these parameters should've already been in the checkpoint's config...
    parser.add_argument("--world_size",
                        type=int,
                        default=1,
                        help="For distributed training: world size")

    parser.add_argument("--batch_size",
                        default=8,
                        type=int,
                        help="Batch size.")
    parser.add_argument("--seed", type=int, default=2018)
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--cuda',
                        type=bool,
                        default=torch.cuda.is_available(),
                        help='whether to use GPU acceleration.')
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--do_proper",
                        type=str,
                        default=False,
                        help="Can be used for distant debugging.")
    parser.add_argument("--do_improper",
                        type=str,
                        default=False,
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup devices and distributed training
    if args.local_rank > -1:
        device = initialize_distributed(args)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # load task info
    task = args.task
    task_defs = TaskDefs(args.task_def)
    assert args.task in task_defs._task_type_map
    assert args.task in task_defs._data_type_map
    assert args.task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    data_type = task_defs._data_type_map[args.task]
    task_type = task_defs._task_type_map[args.task]
    metric_meta = task_defs._metric_meta_map[args.task]
    # load model
    checkpoint_path = args.checkpoint
    assert os.path.exists(checkpoint_path)
    if args.cuda:
        state_dict = torch.load(checkpoint_path)
    else:
        state_dict = torch.load(checkpoint_path, map_location="cpu")
    opt = state_dict['config']
    args.bin_on = False
    opt.update(vars(args))
    model = MTDNNModel(opt, device=device, state_dict=state_dict)

    # Load pretrained model and tokenizer
    # Load data
    data = pd.read_csv('data_complex/lcp_test.tsv',
                       sep='\t',
                       header=None,
                       names=['idx', 'complexity', 'sentence', 'token'])
    data['complexity'] = np.load(
        '/content/gdrive/My Drive/Colab Notebooks/cs99/from_macbook/single_test_labels.npy'
    )
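    # bucket the continuous complexity score (range 0-1) into five ordinal classes for analysis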
    data['class'] = pd.cut(data['complexity'],
                           labels=[1, 2, 3, 4, 5],
                           bins=[0, 0.2, 0.4, 0.6, 0.8, 1],
                           include_lowest=True)
    data['sent_len'] = data['sentence'].str.len()
    with open(
            '/content/gdrive/My Drive/Colab Notebooks/cs99/new-mt-dnn/checkpoints/bert-cased_lcp-single_2021-01-19T0309/lcp_test_scores_epoch_4.json',
            'r') as file:
        single_dev_bert_scores = json.load(file)
        data['finetuned_complexity'] = single_dev_bert_scores['scores']
        data['finetuned_error'] = data['finetuned_complexity'] - data[
            'complexity']
        data['finetuned_abs_error'] = (data['finetuned_complexity'] -
                                       data['complexity']).abs()
    with open(
            '/content/gdrive/My Drive/Colab Notebooks/cs99/new-mt-dnn/checkpoints/bert-cased_lcp-single_2021-01-19T0309/pretrained.json',
            'r') as file:
        single_dev_bert_scores = json.load(file)
        data['pretrained_complexity'] = single_dev_bert_scores['scores']
        data['pretrained_error'] = data['pretrained_complexity'] - data[
            'complexity']
        data['pretrained_abs_error'] = (data['pretrained_complexity'] -
                                        data['complexity']).abs()
    data['improvement'] = data['pretrained_abs_error'] - data[
        'finetuned_abs_error']
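    # heuristic: a target token counts as a proper noun if its first character is uppercase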
    data['proper'] = data['token'].apply(lambda x: x[0].isupper())
    # Distributed training:
    # download model & vocab.
    printable = opt['local_rank'] in [-1, 0]
    encoder_type = opt.get('encoder_type', EncoderModelType.BERT)
    collater = Collater(is_train=True,
                        encoder_type=encoder_type,
                        max_seq_len=opt['max_seq_len'],
                        do_padding=opt['do_padding'])
    dev_data = SingleTaskDataset(opt['prep_input'],
                                 True,
                                 maxlen=opt['max_seq_len'],
                                 task_id=opt['task_id'],
                                 task_def=task_def,
                                 printable=printable)
    if args.do_proper:
        dev_data._data = np.array(
            dev_data._data)[data[data['proper']]['idx'].to_numpy()].tolist()
    if args.do_improper:
        dev_data._data = np.array(
            dev_data._data)[data[~data['proper']]['idx'].to_numpy()].tolist()
    dev_data_loader = DataLoader(dev_data,
                                 batch_size=opt['batch_size_eval'],
                                 collate_fn=collater.collate_fn,
                                 pin_memory=opt['cuda'])

    # Compute head entropy and importance score
    results = []
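    # repeat the head entropy / importance computation over several seeds and keep the
    # per-seed (attn_entropy, head_importance) pairs, which are pickled below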
    for seed in tqdm(range(2010 + 1, 2020 + 1)):  # Set seeds
        set_seed(seed)
        attn_entropy, head_importance, preds, labels = compute_heads_importance(
            opt, model, dev_data_loader)
        results.append((attn_entropy, head_importance))
    pkl.dump(
        results,
        open('checkpoints/bert-cased_lcp-single_2021-01-19T0309/results.pkl',
             'wb'))

    # Try head masking (set heads to zero until the score goes under a threshold)
    # and head pruning (remove masked heads and see the effect on the network)
    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
        head_mask = mask_heads(opt, model, dev_data_loader)
Example #15
0
# load data
test_data = BatchGen(BatchGen.load(args.prep_input, False, pairwise=pw_task, maxlen=args.max_seq_len),
                     batch_size=args.batch_size_eval,
                     gpu=args.cuda, is_train=False,
                     task_id=args.task_id,
                     maxlen=args.max_seq_len,
                     pairwise=pw_task,
                     data_type=data_type,
                     task_type=task_type)

# load model
checkpoint_path = args.checkpoint
assert os.path.exists(checkpoint_path)
if args.cuda:
    state_dict = torch.load(checkpoint_path)
else:
    state_dict = torch.load(checkpoint_path, map_location="cpu")
config = state_dict['config']
config["cuda"] = args.cuda
model = MTDNNModel(config, state_dict=state_dict)

test_metrics, test_predictions, scores, golds, test_ids = eval_model(model, test_data,
                                                                     metric_meta=metric_meta,
                                                                     use_cuda=args.cuda, with_label=args.with_label)

results = {'metrics': test_metrics, 'predictions': test_predictions, 'uids': test_ids, 'scores': scores}
dump(args.score, results)
if args.with_label:
    print(test_metrics)
Example #16
0
def main():
    task_def_path = 'data_complex/lcp.yml'
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    parser = argparse.ArgumentParser()
    model_config(parser)
    set_config(parser)
    train_config(parser)
    args = parser.parse_args()
    encoder_type = args.encoder_type
    layer_indexes = [int(x) for x in args.layers.split(",")]
    set_environment(args.seed)
    # process data
    data, is_single_sentence = process_data(args)
    data_type = DataFormat.PremiseOnly if is_single_sentence else DataFormat.PremiseAndOneHypothesis
    fout_temp = '{}.tmp'.format(args.finput)
    dump_data(data, fout_temp)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    dataset = SingleTaskDataset(fout_temp, False, maxlen=args.max_seq_length, task_def=task_def)#, data_type=data_type)
    batcher = DataLoader(dataset, batch_size=args.batch_size, collate_fn=collater.collate_fn, pin_memory=args.cuda)
    opt = vars(args)
    # load model
    if os.path.exists(args.checkpoint):
        state_dict = torch.load(args.checkpoint)
        config = state_dict['config']
        config['dump_feature'] = True
        config['local_rank'] = -1
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error(
            'Could not find the init model!\n The parameters will be initialized randomly!')
        logger.error('#' * 20)
        return
    num_all_batches = len(batcher)
    model = MTDNNModel(
        opt,
        state_dict=state_dict,
        num_train_step=num_all_batches)
    if args.cuda:
        model.cuda()

    features_dict = {}
    for batch_meta, batch_data in batcher:
        batch_meta, batch_data = Collater.patch_data(args.cuda, batch_meta, batch_data)
        all_encoder_layers, _ = model.extract(batch_meta, batch_data)
        embeddings = [all_encoder_layers[idx].detach().cpu().numpy()
                      for idx in layer_indexes]

        #import pdb; pdb.set_trace()
        uids = batch_meta['uids']
        masks = batch_data[batch_meta['mask']].detach().cpu().numpy().tolist()
        for idx, uid in enumerate(uids):
            slen = sum(masks[idx])
            features = {}
            for yidx, layer in enumerate(layer_indexes):
                features[layer] = str(embeddings[yidx][idx][:slen].tolist())
            features_dict[uid] = features

    # save features
    with open(args.foutput, 'w', encoding='utf-8') as writer:
        for sample in data:
            uid = sample['uid']
            tokens = sample['tokens']
            feature = features_dict[uid]
            feature['tokens'] = tokens
            feature['uid'] = uid
            writer.write('{}\n'.format(json.dumps(feature)))
Example #17
0
    'freeze_layers': -1,
    'embedding_opt': 0,
    'lr_gamma': 0.5,
    'bert_l2norm': 0.0,
    'scheduler_type': 'ms',
    'output_dir': 'checkpoints/scitail_tl_adamax_answer_opt0_gc0_ggc1_7_2_19',
    'seed': 2018,
    'task_config_path': 'configs/tasks_config.json',
    'tasks_dropout_p': [0.1]
}
state_dict = torch.load("checkpoint/snli_model_0.pt")
config = state_dict['config']
config['attention_probs_dropout_prob'] = 0.1
config['hidden_dropout_prob'] = 0.1
opt.update(config)
model = MTDNNModel(opt, state_dict=state_dict, num_train_step=50)

test_metrics, test_predictions, scores, golds, test_ids = eval_model(
    model,
    test_data,
    metric_meta=metric_meta,
    use_cuda=args.cuda,
    with_label=args.with_label)

results = {
    'metrics': test_metrics,
    'predictions': test_predictions,
    'uids': test_ids,
    'scores': scores
}
dump(args.score, results)
Example #18
0
def main():
    global tokenizer, test_collater, model
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    tasks = {}
    tasks_class = {}
    nclass_list = []
    decoder_opts = []
    task_types = []
    dropout_list = []
    loss_types = []
    kd_loss_types = []

    #train_datasets = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in task_defs.n_class_map
        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]
        nclass = task_defs.n_class_map[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(tasks_class)

        task_type = task_defs.task_type_map[prefix]

        dopt = generate_decoder_opt(task_defs.enable_san_map[prefix], opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)
        task_types.append(task_type)
        loss_types.append(task_defs.loss_map[prefix])
        kd_loss_types.append(task_defs.kd_loss_map[prefix])

        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)

        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)

        dropout_p = task_defs.dropout_p_map.get(prefix, args.dropout_p)
        dropout_list.append(dropout_p)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        # train_data_set = SingleTaskDataset(train_path, True, maxlen=args.max_seq_len, task_id=task_id,
        #                                    task_type=task_type, data_type=data_type)
        # train_datasets.append(train_data_set)
    #train_collater = Collater(dropout_w=args.dropout_w, encoder_type=encoder_type)
    # multi_task_train_dataset = MultiTaskDataset(train_datasets)
    # multi_task_batch_sampler = MultiTaskBatchSampler(train_datasets, args.batch_size, args.mix_opt, args.ratio)
    # multi_task_train_data = DataLoader(multi_task_train_dataset, batch_sampler=multi_task_batch_sampler,
    #                                    collate_fn=train_collater.collate_fn, pin_memory=args.cuda)

    opt['answer_opt'] = decoder_opts
    opt['task_types'] = task_types
    opt['tasks_dropout_p'] = dropout_list
    opt['loss_types'] = loss_types
    opt['kd_loss_types'] = kd_loss_types

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    test_collater = Collater(is_train=False, encoder_type=encoder_type)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    bert_model_path = 'checkpoints/my_mnli/model_0.pt'
    state_dict = None

    if encoder_type == EncoderModelType.BERT:
        if os.path.exists(bert_model_path):
            state_dict = torch.load(bert_model_path, map_location=torch.device('cpu'))
            config = state_dict['config']
            config['attention_probs_dropout_prob'] = args.bert_dropout_p
            config['hidden_dropout_prob'] = args.bert_dropout_p
            config['multi_gpu_on'] = opt["multi_gpu_on"]
            opt.update(config)
        else:
            logger.error('#' * 20)
            logger.error('Could not find the init model!\n The parameters will be initialized randomly!')
            logger.error('#' * 20)
            config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
            config['multi_gpu_on'] = opt["multi_gpu_on"]
            opt.update(config)
    elif encoder_type == EncoderModelType.ROBERTA:
        bert_model_path = '{}/model.pt'.format(bert_model_path)
        if os.path.exists(bert_model_path):
            new_state_dict = {}
            state_dict = torch.load(bert_model_path)
            for key, val in state_dict['model'].items():
                if key.startswith('decoder.sentence_encoder'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
                elif key.startswith('classification_heads'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
            state_dict = {'state': new_state_dict}

    model = MTDNNModel(opt, state_dict=state_dict)
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    # # tensorboard
    # if args.tensorboard:
    #     args.tensorboard_logdir = os.path.join(args.output_dir, args.tensorboard_logdir)
    #     tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
Example #19
0
assert args.task in task_defs.data_type_map
assert args.task in task_defs.metric_meta_map
data_type = task_defs.data_type_map[args.task]
task_type = task_defs.task_type_map[args.task]
metric_meta = task_defs.metric_meta_map[args.task]

# load model
checkpoint_path = args.checkpoint
assert os.path.exists(checkpoint_path)
if args.cuda:
    state_dict = torch.load(checkpoint_path)
else:
    state_dict = torch.load(checkpoint_path, map_location="cpu")
config = state_dict['config']
config["cuda"] = args.cuda
model = MTDNNModel(config, state_dict=state_dict)
model.load(checkpoint_path)
encoder_type = config.get('encoder_type', EncoderModelType.BERT)
# load data
test_data = BatchGen(BatchGen.load(args.prep_input,
                                   False,
                                   task_type=task_type,
                                   maxlen=args.max_seq_len),
                     batch_size=args.batch_size_eval,
                     gpu=args.cuda,
                     is_train=False,
                     task_id=args.task_id,
                     maxlen=args.max_seq_len,
                     data_type=data_type,
                     task_type=task_type,
                     encoder_type=encoder_type)
Example #20
0
def main():
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}
    tasks_class = {}
    nclass_list = []
    decoder_opts = []
    dropout_list = []

    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in DATA_META
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        nclass = DATA_META[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(
                tasks_class)

        task_type = TASK_TYPE[prefix]
        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True

        dopt = generate_decoder_opt(prefix, opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)

        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)

        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)

        dropout_p = args.dropout_p
        if tasks_config and prefix in tasks_config:
            dropout_p = tasks_config[prefix]
        dropout_list.append(dropout_p)

        train_data_ratio_string = str(
            args.train_data_ratio) + "p" if args.train_data_ratio < 100 else ""

        train_path = os.path.join(
            data_dir, '{0}_train{1}.json'.format(dataset,
                                                 train_data_ratio_string))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(BatchGen.load(train_path,
                                            True,
                                            pairwise=pw_task,
                                            maxlen=args.max_seq_len),
                              batch_size=batch_size,
                              dropout_w=args.dropout_w,
                              gpu=args.cuda,
                              task_id=task_id,
                              maxlen=args.max_seq_len,
                              pairwise=pw_task,
                              data_type=data_type,
                              task_type=task_type)
        train_data_list.append(train_data)

    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[
            DATA_META[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = TASK_TYPE[prefix]

        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True

        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path,
                                              False,
                                              pairwise=pw_task,
                                              maxlen=args.max_seq_len),
                                batch_size=args.batch_size_eval,
                                gpu=args.cuda,
                                is_train=False,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path,
                                               False,
                                               pairwise=pw_task,
                                               maxlen=args.max_seq_len),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda,
                                 is_train=False,
                                 task_id=task_id,
                                 maxlen=args.max_seq_len,
                                 pairwise=pw_task,
                                 data_type=data_type,
                                 task_type=task_type)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    all_iters = [iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]
    num_all_batches = args.epochs * sum(all_lens)

    if len(train_data_list) > 1 and args.ratio > 0:
        num_all_batches = int(args.epochs * (len(train_data_list[0]) *
                                             (1 + args.ratio)))

    model_path = args.init_checkpoint
    state_dict = None

    if os.path.exists(model_path):
        state_dict = torch.load(model_path, map_location='cpu')
        config = state_dict['config']
        config['attention_probs_dropout_prob'] = args.bert_dropout_p
        config['hidden_dropout_prob'] = args.bert_dropout_p
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error('Could not find the init model!\n Exit application!')
        logger.error('#' * 20)
        try:
            shutil.rmtree(output_dir)
        except Exception as e:
            print(e)
        exit(1)

    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    ####model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ###print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    if args.freeze_layers > 0:
        model.network.freeze_layers(args.freeze_layers)

    if args.cuda:
        model.cuda()

    best_F1_macro = -1.0
    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        for train_data in train_data_list:
            train_data.reset()
        start = datetime.now()
        all_indices = []
        if len(train_data_list) > 1 and (args.ratio > 0 or
                                         args.reduce_first_dataset_ratio > 0):
            main_indices = [0] * (int(args.reduce_first_dataset_ratio * len(
                train_data_list[0])) if args.reduce_first_dataset_ratio > 0
                                  else len(train_data_list[0]))
            extra_indices = []
            for i in range(1, len(train_data_list)):
                extra_indices += [i] * len(train_data_list[i])
            if args.ratio > 0:
                random_picks = int(
                    min(
                        len(train_data_list[0]) * args.ratio,
                        len(extra_indices)))
                extra_indices = np.random.choice(extra_indices,
                                                 random_picks,
                                                 replace=False).tolist()
            if args.mix_opt > 0:
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices
            logger.info(
                "Main batches loaded (first dataset in list): {}".format(
                    len(main_indices)))
            logger.info(
                "Extra batches loaded (all except first dataset in list): {}".
                format(len(extra_indices)))

        else:  # build the index list over all train sets; larger sets contribute more batch slots, so e.g. a large train_set[1] is trained on more often
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)

        for i in range(len(all_indices)):
            task_id = all_indices[i]
            batch_meta, batch_data = next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if model.updates % args.log_per_updates == 0 or model.updates == 1:
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(
                        task_id, model.updates, model.train_loss.avg,
                        str((datetime.now() - start) / (i + 1) *
                            (len(all_indices) - i - 1)).split('.')[0]))

        temp_dev_F1s = []
        dev_dump_list = []
        test_dump_list = []
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = GLOBAL_MAP.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids, premises, hypotheses = eval_model(
                    model, dev_data, dataset=prefix, use_cuda=args.cuda)
                for key, val in dev_metrics.items():
                    if not isinstance(val, dict):
                        logger.warning(
                            "Task {0} -- epoch {1} -- Dev {2}: {3:.3f}".format(
                                dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores,
                    'golds': golds,
                    'premises': premises,
                    'hypotheses': hypotheses
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)

                # for checkpoint
                temp_dev_F1s.append(dev_metrics['F1_macro'])
                dev_dump_list.append({
                    "output_dir": output_dir,
                    "dev_metrics": dev_metrics,
                    "dev_predictions": dev_predictions,
                    "golds": golds,
                    "opt": opt,
                    "dataset": dataset
                })

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                test_metrics, test_predictions, scores, golds, test_ids, premises, hypotheses = eval_model(
                    model,
                    test_data,
                    dataset=prefix,
                    use_cuda=args.cuda,
                    with_label=True)
                score_file = os.path.join(
                    output_dir,
                    '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores,
                    'golds': golds,
                    'premises': premises,
                    'hypotheses': hypotheses
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_test_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

                # for checkpoint
                test_dump_list.append({
                    "output_dir": output_dir,
                    "test_metrics": test_metrics,
                    "test_predictions": test_predictions,
                    "golds": golds,
                    "opt": opt,
                    "dataset": dataset
                })

        # save checkpoint
        if np.average(temp_dev_F1s) > best_F1_macro:
            print("Save new model! Current best F1 macro over all dev sets: " +
                  "{0:.2f}".format(best_F1_macro) + ". New: " +
                  "{0:.2f}".format(np.average(temp_dev_F1s)))
            best_F1_macro = np.average(temp_dev_F1s)

            # override current dump file
            for l in dev_dump_list:
                dump_result_files(l['dataset'])(l['output_dir'], epoch,
                                                l['dev_metrics'],
                                                str(l['dev_predictions']),
                                                str(l['golds']), "dev",
                                                l['opt'], l['dataset'])

            for l in test_dump_list:
                dump_result_files(l['dataset'])(l['output_dir'], epoch,
                                                l['test_metrics'],
                                                str(l['test_predictions']),
                                                str(l['golds']), "test",
                                                l['opt'], l['dataset'])

            # save model
            model_file = os.path.join(output_dir, 'model.pt')
            model.save(model_file)
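The epoch loop in Example #20 interleaves batches from several tasks by building a list of task indices. A standalone sketch of that mixing policy, using plain Python/NumPy and toy batch counts instead of real BatchGen objects, to make the ratio / mix_opt behaviour easy to inspect:

import random
import numpy as np

def build_task_schedule(dataset_lens, ratio=0.0, mix_opt=0, seed=0):
    # dataset_lens[i] mirrors len(train_data_list[i]): the number of batches per task
    random.seed(seed)
    np.random.seed(seed)
    main_indices = [0] * dataset_lens[0]
    extra_indices = []
    for i in range(1, len(dataset_lens)):
        extra_indices += [i] * dataset_lens[i]
    if ratio > 0:
        # cap the auxiliary batches at ratio x the size of the main task
        picks = int(min(dataset_lens[0] * ratio, len(extra_indices)))
        extra_indices = np.random.choice(extra_indices, picks, replace=False).tolist()
    if mix_opt > 0:
        random.shuffle(extra_indices)
        all_indices = extra_indices + main_indices
    else:
        all_indices = main_indices + extra_indices
    if mix_opt < 1:
        random.shuffle(all_indices)
    return all_indices

# e.g. 4 main batches plus two auxiliary tasks with 6 and 2 batches, capped at 1x extra
print(build_task_schedule([4, 6, 2], ratio=1.0, mix_opt=0))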
Example #21
def load_model_for_viz_1(task_def_path,
                         checkpoint_path,
                         input_path,
                         model_type='bert-base-cased',
                         do_lower_case=False,
                         use_cuda=True):
    # load task info
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    assert task in task_defs._task_type_map
    assert task in task_defs._data_type_map
    assert task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    data_type = task_defs._data_type_map[task]
    task_type = task_defs._task_type_map[task]
    metric_meta = task_defs._metric_meta_map[task]
    # load model
    assert os.path.exists(checkpoint_path)
    state_dict = torch.load(checkpoint_path)
    config = state_dict['config']
    config["cuda"] = use_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    task_def = task_defs.get_task_def(prefix)
    task_def_list = [task_def]
    config['task_def_list'] = task_def_list
    ## temp fix
    config['fp16'] = False
    config['answer_opt'] = 0
    config['adv_train'] = False
    #del state_dict['optimizer']
    config['output_attentions'] = True
    config['local_rank'] = -1
    model = MTDNNModel(config, device, state_dict=state_dict)
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)
    root = os.path.basename(task_def_path)
    literal_model_type = model_type.split('-')[0].upper()
    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if 'base' in model_type:
        mt_dnn_suffix += "_base"
    elif 'large' in model_type:
        mt_dnn_suffix += "_large"
    # load tokenizer
    config_class, model_class, tokenizer_class = MODEL_CLASSES[
        literal_model_type]
    tokenizer = tokenizer_class.from_pretrained(model_type,
                                                do_lower_case=do_lower_case)
    # load data
    prep_input = input_path
    test_data_set = SingleTaskDataset(prep_input,
                                      False,
                                      maxlen=512,
                                      task_id=0,
                                      task_def=task_def)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    test_data = DataLoader(test_data_set,
                           batch_size=1,
                           collate_fn=collater.collate_fn,
                           pin_memory=True)
    idx = 0
    results = []
    for batch_meta, batch_data in tqdm(test_data):
        if idx < 360:
            idx += 1
            continue
        batch_meta, batch_data = Collater.patch_data(device, batch_meta,
                                                     batch_data)
        model.network.eval()
        task_id = batch_meta['task_id']
        task_def = TaskDef.from_dict(batch_meta['task_def'])
        task_type = task_def.task_type
        task_obj = tasks.get_task_obj(task_def)
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        input_ids = inputs[0]
        token_type_ids = inputs[1]
        attention = model.mnetwork.module.bert(
            input_ids, token_type_ids=token_type_ids)[-1]
        batch_size = batch_data[0].shape[0]
        for i in range(batch_size):
            # slice out sample i under a new name so the full-batch attention is not overwritten
            sample_attention = tuple([item[i:i + 1, :, :, :] for item in attention])
            input_id_list = input_ids[i].tolist()
            tokens = tokenizer.convert_ids_to_tokens(input_id_list)
            idx_sep = listRightIndex(tokens, '[SEP]') + 1
            tokens = tokens[:idx_sep]
            sample_attention = tuple(
                [item[:, :, :idx_sep, :idx_sep] for item in sample_attention])
            results.append((sample_attention, tokens))
        idx += batch_size
    return results
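A hedged usage example for load_model_for_viz_1; the paths below are placeholders, and the shape comment follows from the per-sample slicing at the end of the function:

# all paths are hypothetical placeholders
results = load_model_for_viz_1('experiments/my_task/my_task_def.yml',
                               'checkpoints/model_0.pt',
                               'canonical_data/bert-base-cased/my_task_test.json',
                               model_type='bert-base-cased',
                               do_lower_case=False,
                               use_cuda=True)
attention, tokens = results[0]
# attention holds one tensor per layer, each shaped
# (1, num_heads, len(tokens), len(tokens)) after truncation at the last [SEP]
print(len(tokens), attention[0].shape)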
Example #22
def main():
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}
    tasks_class = {}
    nclass_list = []
    decoder_opts = []
    dropout_list = []

    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in DATA_META
        assert prefix in DATA_TYPE
        nclass = DATA_META[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(tasks_class)

        dopt = generate_decoder_opt(prefix, opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)

        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)

        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)

        dropout_p = args.dropout_p
        if tasks_config and prefix in tasks_config:
            dropout_p = tasks_config[prefix]
        dropout_list.append(dropout_p)

    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    stress_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[DATA_META[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = TASK_TYPE[prefix]

        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True

        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path, False, pairwise=pw_task, maxlen=args.max_seq_len),
                                batch_size=args.batch_size_eval,
                                gpu=args.cuda, is_train=False,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path, False, pairwise=pw_task, maxlen=args.max_seq_len),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda, is_train=False,
                                 task_id=task_id,
                                 maxlen=args.max_seq_len,
                                 pairwise=pw_task,
                                 data_type=data_type,
                                 task_type=task_type)
        test_data_list.append(test_data)

        stress_data = []
        if args.stress_tests != "NONE":
            for stress_test in args.stress_tests.split(','):
                stress_path = os.path.join(data_dir, '{}_test_{}.json'.format(dataset, stress_test))
                if os.path.exists(stress_path):
                    stress_data.append(
                        BatchGen(BatchGen.load(stress_path, False, pairwise=pw_task, maxlen=args.max_seq_len),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda, is_train=False,
                                 task_id=task_id,
                                 maxlen=512,
                                 pairwise=pw_task,
                                 data_type=data_type,
                                 task_type=task_type))
            stress_data_list.append(stress_data)


    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    all_lens = [len(bg) for bg in train_data_list]
    num_all_batches = args.epochs * sum(all_lens)

    if len(train_data_list) > 1 and args.ratio > 0:
        num_all_batches = int(args.epochs * (len(train_data_list[0]) * (1 + args.ratio)))

    model_path = args.init_checkpoint
    state_dict = None

    if os.path.exists(model_path):
        state_dict = torch.load(model_path)
        config = state_dict['config']
        config['attention_probs_dropout_prob'] = args.bert_dropout_p
        config['hidden_dropout_prob'] = args.bert_dropout_p
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error('Could not find the init model!\n Exit application!')
        logger.error('#' * 20)
        exit(1)  # the log message above promises to exit, so actually do it

    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=num_all_batches)
    ####model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ###print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    if args.freeze_layers > 0:
        model.network.freeze_layers(args.freeze_layers)

    if args.cuda:
        model.cuda()
    for epoch in range(0, 1):
        dev_dump_list = []
        test_dump_list = []
        stress_dump_list = []
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = GLOBAL_MAP.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids, premises, hypotheses = eval_model(model, dev_data, dataset=prefix,
                                                                                 use_cuda=args.cuda)
                for key, val in dev_metrics.items():
                    if not isinstance(val, dict):
                        logger.warning("Task {0} -- epoch {1} -- Dev {2}: {3:.3f}".format(dataset, epoch, key, val))

                if args.dump_to_checkpoints == 1:
                    score_file = os.path.join(output_dir, '{}_dev_scores_{}_EVAL_ONLY.json'.format(dataset, epoch))
                    results = {'metrics': dev_metrics, 'predictions': dev_predictions, 'uids': dev_ids,
                               'scores': scores, 'golds': golds,
                               'premises': premises, 'hypotheses': hypotheses}
                    dump(score_file, results)
                    official_score_file = os.path.join(output_dir,
                                                       '{}_dev_scores_{}_EVAL_ONLY.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)

                # for checkpoint
                dev_dump_list.append({
                    "output_dir": output_dir,
                    "dev_metrics": dev_metrics,
                    "dev_predictions": dev_predictions,
                    "golds": golds,
                    "opt": opt,
                    "dataset": dataset
                })

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                test_metrics, test_predictions, scores, golds, test_ids, premises, hypotheses = eval_model(model, test_data, dataset=prefix,
                                                                                 use_cuda=args.cuda, with_label=True)

                if args.dump_to_checkpoints == 1:
                    score_file = os.path.join(output_dir, '{}_test_scores_{}_EVAL_ONLY.json'.format(dataset, epoch))
                    results = {'metrics': test_metrics, 'predictions': test_predictions, 'uids': test_ids, 'scores': scores, 'golds': golds,
                               'premises': premises, 'hypotheses': hypotheses}
                    dump(score_file, results)
                    official_score_file = os.path.join(output_dir, '{}_test_scores_{}_EVAL_ONLY.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
                    logger.info('[new test scores saved.]')

                # for checkpoint
                test_dump_list.append({
                    "output_dir": output_dir,
                    "test_metrics": test_metrics,
                    "test_predictions": test_predictions,
                    "golds": golds,
                    "opt": opt,
                    "dataset": dataset
                })

            # stress test eval
            if args.stress_tests != "NONE":
                stress_data = stress_data_list[idx]
                for j, stress_test in enumerate(args.stress_tests.split(',')):
                    stress_metrics, stress_predictions, scores, golds, stress_ids, premises, hypotheses = \
                        eval_model(model, stress_data[j], dataset=prefix, use_cuda=args.cuda, with_label=True)

                    if args.dump_to_checkpoints == 1:
                        score_file = os.path.join(output_dir, '{}_test_{}_scores_{}_EVAL_ONLY.json'.format(dataset, stress_test, epoch))
                        results = {'metrics': stress_metrics, 'predictions': stress_predictions, 'uids': stress_ids, 'scores': scores, 'golds': golds,
                                   'premises': premises, 'hypotheses': hypotheses}
                        dump(score_file, results)
                        official_score_file = os.path.join(output_dir, '{}_test_{}_scores_{}_EVAL_ONLY.tsv'.format(dataset, stress_test, epoch))
                        submit(official_score_file, results, label_dict)
                        logger.info('[new stress test scores for "{}" saved.]'.format(stress_test))

                    # for checkpoint
                    stress_dump_list.append({
                        "output_dir": output_dir,
                        "test_metrics": stress_metrics,
                        "test_predictions": stress_predictions,
                        "golds": golds,
                        "opt": opt,
                        "dataset": dataset,
                        "stress_test": stress_test
                    })

        # save results
        print("Save new results!")

        for l in dev_dump_list:
            dump_result_files(l['dataset'])(l['output_dir'], -1, l['dev_metrics'], str(l['dev_predictions']),
                                            str(l['golds']), "dev", l['opt'], l['dataset'])
        for l in test_dump_list:
            dump_result_files(l['dataset'])(l['output_dir'], -1, l['test_metrics'], str(l['test_predictions']),
                                            str(l['golds']), "test", l['opt'], l['dataset'])

        if args.stress_tests != "NONE":
            for l in stress_dump_list:
                dump_result_files(l['dataset'])(l['output_dir'], -1, l['test_metrics'], str(l['test_predictions']),
                                                str(l['golds']), l['stress_test'], l['opt'], l['dataset'])
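The stress-test branch in Example #22 only evaluates files that follow the '{dataset}_test_{stress_test}.json' naming scheme. A small pre-flight check, with placeholder values for data_dir, the dataset list and the --stress_tests string, of which stress sets would actually be picked up:

import os

# placeholder values; substitute your own data_dir, datasets and stress test names
data_dir = 'canonical_data/bert_uncased_lower'
test_datasets = ['mnli_matched', 'mnli_mismatched']
stress_tests = 'negation,word_overlap'

for dataset in test_datasets:
    for stress_test in stress_tests.split(','):
        stress_path = os.path.join(data_dir, '{}_test_{}.json'.format(dataset, stress_test))
        print(stress_path, 'found' if os.path.exists(stress_path) else 'missing')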
Example #23
def main():
    # set up dist
    if args.local_rank > -1:
        device = initialize_distributed(args)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    print_message(logger, 'Launching the MT-DNN training')
    #return
    tasks = {}
    task_def_list = []
    dropout_list = []
    printable = args.local_rank in [-1, 0]

    train_datasets = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks:
            continue
        task_id = len(tasks)
        tasks[prefix] = task_id
        task_def = task_defs.get_task_def(prefix)
        task_def_list.append(task_def)
        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        print_message(logger,
                      'Loading {} as task {}'.format(train_path, task_id))
        train_data_set = SingleTaskDataset(train_path,
                                           True,
                                           maxlen=args.max_seq_len,
                                           task_id=task_id,
                                           task_def=task_def,
                                           printable=printable)
        train_datasets.append(train_data_set)
    train_collater = Collater(dropout_w=args.dropout_w,
                              encoder_type=encoder_type,
                              soft_label=args.mkd_opt > 0,
                              max_seq_len=args.max_seq_len,
                              do_padding=args.do_padding)
    multi_task_train_dataset = MultiTaskDataset(train_datasets)
    if args.local_rank != -1:
        multi_task_batch_sampler = DistMultiTaskBatchSampler(
            train_datasets,
            args.batch_size,
            args.mix_opt,
            args.ratio,
            rank=args.local_rank,
            world_size=args.world_size)
    else:
        multi_task_batch_sampler = MultiTaskBatchSampler(
            train_datasets,
            args.batch_size,
            args.mix_opt,
            args.ratio,
            bin_on=args.bin_on,
            bin_size=args.bin_size,
            bin_grow_ratio=args.bin_grow_ratio)
    multi_task_train_data = DataLoader(multi_task_train_dataset,
                                       batch_sampler=multi_task_batch_sampler,
                                       collate_fn=train_collater.collate_fn,
                                       pin_memory=args.cuda)

    opt['task_def_list'] = task_def_list

    dev_data_list = []
    test_data_list = []
    test_collater = Collater(is_train=False,
                             encoder_type=encoder_type,
                             max_seq_len=args.max_seq_len,
                             do_padding=args.do_padding)
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_def = task_defs.get_task_def(prefix)
        task_id = tasks[prefix]
        task_type = task_def.task_type
        data_type = task_def.data_type

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data_set = SingleTaskDataset(dev_path,
                                             False,
                                             maxlen=args.max_seq_len,
                                             task_id=task_id,
                                             task_def=task_def,
                                             printable=printable)
            if args.local_rank != -1:
                dev_data_set = DistTaskDataset(dev_data_set, task_id)
                single_task_batch_sampler = DistSingleTaskBatchSampler(
                    dev_data_set,
                    args.batch_size_eval,
                    rank=args.local_rank,
                    world_size=args.world_size)
                dev_data = DataLoader(dev_data_set,
                                      batch_sampler=single_task_batch_sampler,
                                      collate_fn=test_collater.collate_fn,
                                      pin_memory=args.cuda)
            else:
                dev_data = DataLoader(dev_data_set,
                                      batch_size=args.batch_size_eval,
                                      collate_fn=test_collater.collate_fn,
                                      pin_memory=args.cuda)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data_set = SingleTaskDataset(test_path,
                                              False,
                                              maxlen=args.max_seq_len,
                                              task_id=task_id,
                                              task_def=task_def,
                                              printable=printable)
            if args.local_rank != -1:
                test_data_set = DistTaskDataset(test_data_set, task_id)
                single_task_batch_sampler = DistSingleTaskBatchSampler(
                    test_data_set,
                    args.batch_size_eval,
                    rank=args.local_rank,
                    world_size=args.world_size)
                test_data = DataLoader(test_data_set,
                                       batch_sampler=single_task_batch_sampler,
                                       collate_fn=test_collater.collate_fn,
                                       pin_memory=args.cuda)
            else:
                test_data = DataLoader(test_data_set,
                                       batch_size=args.batch_size_eval,
                                       collate_fn=test_collater.collate_fn,
                                       pin_memory=args.cuda)
        test_data_list.append(test_data)

    print_message(logger, '#' * 20)
    print_message(logger, opt)
    print_message(logger, '#' * 20)

    # divide by the number of gradient accumulation steps.
    num_all_batches = args.epochs * len(
        multi_task_train_data) // args.grad_accumulation_step
    print_message(logger,
                  '############# Gradient Accumulation Info #############')
    print_message(
        logger,
        'number of step: {}'.format(args.epochs * len(multi_task_train_data)))
    print_message(
        logger, 'number of grad accumulation steps: {}'.format(
            args.grad_accumulation_step))
    print_message(logger,
                  'adjusted number of step: {}'.format(num_all_batches))
    print_message(logger,
                  '############# Gradient Accumulation Info #############')

    init_model = args.init_checkpoint
    state_dict = None

    if os.path.exists(init_model):
        if encoder_type == EncoderModelType.BERT or \
            encoder_type == EncoderModelType.DEBERTA or \
            encoder_type == EncoderModelType.ELECTRA:
            state_dict = torch.load(init_model, map_location=device)
            config = state_dict['config']
        elif encoder_type == EncoderModelType.ROBERTA or encoder_type == EncoderModelType.XLM:
            model_path = '{}/model.pt'.format(init_model)
            state_dict = torch.load(model_path, map_location=device)
            arch = state_dict['args'].arch
            arch = arch.replace('_', '-')
            if encoder_type == EncoderModelType.XLM:
                arch = "xlm-{}".format(arch)
            # convert model arch
            from data_utils.roberta_utils import update_roberta_keys
            from data_utils.roberta_utils import patch_name_dict
            state = update_roberta_keys(
                state_dict['model'], nlayer=state_dict['args'].encoder_layers)
            state = patch_name_dict(state)
            literal_encoder_type = EncoderModelType(
                opt['encoder_type']).name.lower()
            config_class, model_class, tokenizer_class = MODEL_CLASSES[
                literal_encoder_type]
            config = config_class.from_pretrained(arch).to_dict()
            state_dict = {'state': state}
    else:
        if opt['encoder_type'] not in EncoderModelType._value2member_map_:
            raise ValueError("encoder_type is out of pre-defined types")
        literal_encoder_type = EncoderModelType(
            opt['encoder_type']).name.lower()
        config_class, model_class, tokenizer_class = MODEL_CLASSES[
            literal_encoder_type]
        config = config_class.from_pretrained(init_model).to_dict()

    config['attention_probs_dropout_prob'] = args.bert_dropout_p
    config['hidden_dropout_prob'] = args.bert_dropout_p
    config['multi_gpu_on'] = opt["multi_gpu_on"]
    if args.num_hidden_layers > 0:
        config['num_hidden_layers'] = args.num_hidden_layers

    opt.update(config)

    model = MTDNNModel(opt,
                       device=device,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        print_message(logger, 'loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    print_message(logger, '\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    print_message(logger,
                  "Total number of params: {}".format(model.total_param))

    # tensorboard
    tensorboard = None
    if args.tensorboard:
        args.tensorboard_logdir = os.path.join(args.output_dir,
                                               args.tensorboard_logdir)
        tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)

    if args.encode_mode:
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            test_data = test_data_list[idx]
            with torch.no_grad():
                encoding = extract_encoding(model,
                                            test_data,
                                            use_cuda=args.cuda)
            torch.save(
                encoding,
                os.path.join(output_dir, '{}_encoding.pt'.format(dataset)))
        return

    for epoch in range(0, args.epochs):
        print_message(logger, 'At epoch {}'.format(epoch), level=1)
        start = datetime.now()

        for i, (batch_meta, batch_data) in enumerate(multi_task_train_data):
            batch_meta, batch_data = Collater.patch_data(
                device, batch_meta, batch_data)
            task_id = batch_meta['task_id']
            model.update(batch_meta, batch_data)

            if model.updates % args.log_per_updates == 0 or model.updates == 1:
                remaining_time = str(
                    (datetime.now() - start) / (i + 1) *
                    (len(multi_task_train_data) - i - 1)).split('.')[0]
                if args.adv_train and args.debug:
                    debug_info = ' basic loss[%.5f] adv loss[%.5f] emb val[%.8f] noise val[%.8f] noise grad val[%.8f] no proj noise[%.8f] ' % (
                        model.basic_loss.avg, model.adv_loss.avg,
                        model.emb_val.avg, model.noise_val.avg,
                        model.noise_grad_val.avg, model.no_proj_noise_val.avg)
                else:
                    debug_info = ' '
                print_message(
                    logger,
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}]{3}remaining[{4}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            debug_info, remaining_time))
                if args.tensorboard:
                    tensorboard.add_scalar('train/loss',
                                           model.train_loss.avg,
                                           global_step=model.updates)

            if args.save_per_updates_on and (
                (model.local_updates) %
                (args.save_per_updates * args.grad_accumulation_step)
                    == 0) and args.local_rank in [-1, 0]:
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                evaluation(model,
                           args.test_datasets,
                           dev_data_list,
                           task_defs,
                           output_dir,
                           epoch,
                           n_updates=args.save_per_updates,
                           with_label=True,
                           tensorboard=tensorboard,
                           glue_format_on=args.glue_format_on,
                           test_on=False,
                           device=device,
                           logger=logger)
                evaluation(model,
                           args.test_datasets,
                           test_data_list,
                           task_defs,
                           output_dir,
                           epoch,
                           n_updates=args.save_per_updates,
                           with_label=False,
                           tensorboard=tensorboard,
                           glue_format_on=args.glue_format_on,
                           test_on=True,
                           device=device,
                           logger=logger)
                print_message(logger,
                              'Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)

        evaluation(model,
                   args.test_datasets,
                   dev_data_list,
                   task_defs,
                   output_dir,
                   epoch,
                   with_label=True,
                   tensorboard=tensorboard,
                   glue_format_on=args.glue_format_on,
                   test_on=False,
                   device=device,
                   logger=logger)
        evaluation(model,
                   args.test_datasets,
                   test_data_list,
                   task_defs,
                   output_dir,
                   epoch,
                   with_label=False,
                   tensorboard=tensorboard,
                   glue_format_on=args.glue_format_on,
                   test_on=True,
                   device=device,
                   logger=logger)
        print_message(logger, '[new test scores at {} saved.]'.format(epoch))
        if args.local_rank in [-1, 0]:
            model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
            model.save(model_file)
    if args.tensorboard:
        tensorboard.close()
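Example #23 divides the raw step count by grad_accumulation_step before sizing the learning-rate schedule. The arithmetic, isolated with toy numbers:

epochs = 3
batches_per_epoch = 1000            # stands in for len(multi_task_train_data)
grad_accumulation_step = 4

raw_steps = epochs * batches_per_epoch
optimizer_steps = raw_steps // grad_accumulation_step
print(raw_steps, optimizer_steps)   # 3000 forward/backward passes, 750 optimizer updates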
Example #24
def main():
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    tasks = {}
    tasks_class = {}
    nclass_list = []
    decoder_opts = []
    task_types = []
    dropout_list = []
    loss_types = []
    kd_loss_types = []

    train_datasets = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in task_defs.n_class_map
        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]
        nclass = task_defs.n_class_map[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(
                tasks_class)

        task_type = task_defs.task_type_map[prefix]

        dopt = generate_decoder_opt(task_defs.enable_san_map[prefix],
                                    opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)
        task_types.append(task_type)
        loss_types.append(task_defs.loss_map[prefix])
        kd_loss_types.append(task_defs.kd_loss_map[prefix])

        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)

        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)

        dropout_p = task_defs.dropout_p_map.get(prefix, args.dropout_p)
        dropout_list.append(dropout_p)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data_set = SingleTaskDataset(train_path,
                                           True,
                                           maxlen=args.max_seq_len,
                                           task_id=task_id,
                                           task_type=task_type,
                                           data_type=data_type)
        train_datasets.append(train_data_set)
    train_collater = Collater(dropout_w=args.dropout_w,
                              encoder_type=encoder_type)
    multi_task_train_dataset = MultiTaskDataset(train_datasets)
    multi_task_batch_sampler = MultiTaskBatchSampler(train_datasets,
                                                     args.batch_size,
                                                     args.mix_opt, args.ratio)
    multi_task_train_data = DataLoader(multi_task_train_dataset,
                                       batch_sampler=multi_task_batch_sampler,
                                       collate_fn=train_collater.collate_fn,
                                       pin_memory=args.cuda)

    opt['answer_opt'] = decoder_opts
    opt['task_types'] = task_types
    opt['tasks_dropout_p'] = dropout_list
    opt['loss_types'] = loss_types
    opt['kd_loss_types'] = kd_loss_types

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    test_collater = Collater(is_train=False, encoder_type=encoder_type)
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[
            task_defs.
            n_class_map[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = task_defs.task_type_map[prefix]

        pw_task = False
        if task_type == TaskType.Ranking:
            pw_task = True

        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data_set = SingleTaskDataset(dev_path,
                                             False,
                                             maxlen=args.max_seq_len,
                                             task_id=task_id,
                                             task_type=task_type,
                                             data_type=data_type)
            dev_data = DataLoader(dev_data_set,
                                  batch_size=args.batch_size_eval,
                                  collate_fn=test_collater.collate_fn,
                                  pin_memory=args.cuda)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data_set = SingleTaskDataset(test_path,
                                              False,
                                              maxlen=args.max_seq_len,
                                              task_id=task_id,
                                              task_type=task_type,
                                              data_type=data_type)
            test_data = DataLoader(test_data_set,
                                   batch_size=args.batch_size_eval,
                                   collate_fn=test_collater.collate_fn,
                                   pin_memory=args.cuda)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    # divide by the number of gradient accumulation steps.
    num_all_batches = args.epochs * len(
        multi_task_train_data) // args.grad_accumulation_step
    logger.info('############# Gradient Accumulation Info #############')
    logger.info('number of step: {}'.format(args.epochs *
                                            len(multi_task_train_data)))
    logger.info('number of grad accumulation steps: {}'.format(
        args.grad_accumulation_step))
    logger.info('adjusted number of step: {}'.format(num_all_batches))
    logger.info('############# Gradient Accumulation Info #############')

    bert_model_path = args.init_checkpoint
    state_dict = None

    if encoder_type == EncoderModelType.BERT:
        if os.path.exists(bert_model_path):
            state_dict = torch.load(bert_model_path)
            config = state_dict['config']
            config['attention_probs_dropout_prob'] = args.bert_dropout_p
            config['hidden_dropout_prob'] = args.bert_dropout_p
            config['multi_gpu_on'] = opt["multi_gpu_on"]
            opt.update(config)
        else:
            logger.error('#' * 20)
            logger.error(
                'Could not find the init model!\n The parameters will be initialized randomly!'
            )
            logger.error('#' * 20)
            config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
            config['multi_gpu_on'] = opt["multi_gpu_on"]
            opt.update(config)
    elif encoder_type == EncoderModelType.ROBERTA:
        bert_model_path = '{}/model.pt'.format(bert_model_path)
        if os.path.exists(bert_model_path):
            new_state_dict = {}
            state_dict = torch.load(bert_model_path)
            for key, val in state_dict['model'].items():
                if key.startswith('decoder.sentence_encoder'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
                elif key.startswith('classification_heads'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
            state_dict = {'state': new_state_dict}

    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    # tensorboard
    if args.tensorboard:
        args.tensorboard_logdir = os.path.join(args.output_dir,
                                               args.tensorboard_logdir)
        tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)

    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        start = datetime.now()

        for i, (batch_meta, batch_data) in enumerate(multi_task_train_data):
            batch_meta, batch_data = Collater.patch_data(
                args.cuda, batch_meta, batch_data)
            task_id = batch_meta['task_id']
            model.update(batch_meta, batch_data)
            if (model.local_updates) % (args.log_per_updates *
                                        args.grad_accumulation_step
                                        ) == 0 or model.local_updates == 1:
                remaining_time = str(
                    (datetime.now() - start) / (i + 1) *
                    (len(multi_task_train_data) - i - 1)).split('.')[0]
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            remaining_time))
                if args.tensorboard:
                    tensorboard.add_scalar('train/loss',
                                           model.train_loss.avg,
                                           global_step=model.updates)

            if args.save_per_updates_on and (
                (model.local_updates) %
                (args.save_per_updates * args.grad_accumulation_step) == 0):
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                logger.info('Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)

        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = task_defs.global_map.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                with torch.no_grad():
                    dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
                        model,
                        dev_data,
                        metric_meta=task_defs.metric_meta_map[prefix],
                        use_cuda=args.cuda,
                        label_mapper=label_dict,
                        task_type=task_defs.task_type_map[prefix])
                for key, val in dev_metrics.items():
                    if args.tensorboard:
                        tensorboard.add_scalar('dev/{}/{}'.format(
                            dataset, key),
                                               val,
                                               global_step=epoch)
                    if isinstance(val, str):
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}:\n {3}'.format(
                                dataset, epoch, key, val))
                    else:
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}: {3:.3f}'.format(
                                dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                with torch.no_grad():
                    test_metrics, test_predictions, scores, golds, test_ids = eval_model(
                        model,
                        test_data,
                        metric_meta=task_defs.metric_meta_map[prefix],
                        use_cuda=args.cuda,
                        with_label=False,
                        label_mapper=label_dict,
                        task_type=task_defs.task_type_map[prefix])
                score_file = os.path.join(
                    output_dir,
                    '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_test_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)
    if args.tensorboard:
        tensorboard.close()
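The training loops above estimate remaining time by linearly extrapolating the average time per processed batch; the same calculation, isolated with toy values:

from datetime import datetime, timedelta

start = datetime.now() - timedelta(minutes=5)   # pretend training started 5 minutes ago
i, total = 249, 1000                            # just finished batch i of total
remaining = (datetime.now() - start) / (i + 1) * (total - i - 1)
print(str(remaining).split('.')[0])             # drop microseconds, e.g. '0:15:00'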
Example #25
def main():
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}
    tasks_class = {}
    nclass_list = []
    decoder_opts = []
    dropout_list = []
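    # Build one training loader per task; with --mtl_opt > 0, tasks with the
    # same number of classes share a task id (and thus an output head).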
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in task_defs.n_class_map
        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]
        nclass = task_defs.n_class_map[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(
                tasks_class)

        task_type = task_defs.task_type_map[prefix]
        pw_task = False
        if task_type == TaskType.Ranking:
            pw_task = True

        dopt = generate_decoder_opt(task_defs.enable_san_map[prefix],
                                    opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)

        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)

        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)

        dropout_p = task_defs.dropout_p_map.get(prefix, args.dropout_p)
        dropout_list.append(dropout_p)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(BatchGen.load(train_path,
                                            True,
                                            pairwise=pw_task,
                                            maxlen=args.max_seq_len),
                              batch_size=batch_size,
                              dropout_w=args.dropout_w,
                              gpu=args.cuda,
                              task_id=task_id,
                              maxlen=args.max_seq_len,
                              pairwise=pw_task,
                              data_type=data_type,
                              task_type=task_type,
                              encoder_type=encoder_type)
        train_data_list.append(train_data)

    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        if args.mtl_opt > 0:
            task_id = tasks_class[task_defs.n_class_map[prefix]]
        else:
            task_id = tasks[prefix]
        task_type = task_defs.task_type_map[prefix]

        pw_task = False
        if task_type == TaskType.Ranking:
            pw_task = True

        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path,
                                              False,
                                              pairwise=pw_task,
                                              maxlen=args.max_seq_len),
                                batch_size=args.batch_size_eval,
                                gpu=args.cuda,
                                is_train=False,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type,
                                encoder_type=encoder_type)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path,
                                               False,
                                               pairwise=pw_task,
                                               maxlen=args.max_seq_len),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda,
                                 is_train=False,
                                 task_id=task_id,
                                 maxlen=args.max_seq_len,
                                 pairwise=pw_task,
                                 data_type=data_type,
                                 task_type=task_type,
                                 encoder_type=encoder_type)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    all_iters = [iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]

    # divide by the number of gradient accumulation steps.
    num_all_batches = args.epochs * sum(
        all_lens) // args.grad_accumulation_step
    logger.info('############# Gradient Accumulation Info #############')
    logger.info('number of steps: {}'.format(args.epochs * sum(all_lens)))
    logger.info('number of grad accumulation steps: {}'.format(
        args.grad_accumulation_step))
    logger.info('adjusted number of steps: {}'.format(num_all_batches))
    logger.info('############# Gradient Accumulation Info #############')

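    # When --ratio > 0 and there is more than one task, auxiliary-task batches
    # are sub-sampled relative to the first (main) task, so the step count is
    # recomputed accordingly.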
    if len(train_data_list) > 1 and args.ratio > 0:
        num_all_batches = int(args.epochs * (len(train_data_list[0]) *
                                             (1 + args.ratio)))

    bert_model_path = args.init_checkpoint
    state_dict = None

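    # Load the encoder checkpoint: a BERT checkpoint is used as-is (with its
    # dropout probabilities overridden), while a fairseq RoBERTa checkpoint has
    # its parameter names remapped to the MT-DNN module layout.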
    if encoder_type == EncoderModelType.BERT:
        if os.path.exists(bert_model_path):
            state_dict = torch.load(bert_model_path)
            config = state_dict['config']
            config['attention_probs_dropout_prob'] = args.bert_dropout_p
            config['hidden_dropout_prob'] = args.bert_dropout_p
            opt.update(config)
        else:
            logger.error('#' * 20)
            logger.error(
                'Could not find the init model!\n The parameters will be initialized randomly!'
            )
            logger.error('#' * 20)
            config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
            opt.update(config)
    elif encoder_type == EncoderModelType.ROBERTA:
        bert_model_path = '{}/model.pt'.format(bert_model_path)
        if os.path.exists(bert_model_path):
            new_state_dict = {}
            state_dict = torch.load(bert_model_path)
            for key, val in state_dict['model'].items():
                if key.startswith('decoder.sentence_encoder'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
                elif key.startswith('classification_heads'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
            state_dict = {'state': new_state_dict}

    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    # model meta string
    headline = '############# Model Arch of MT-DNN #############'
    # print network architecture
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        for train_data in train_data_list:
            train_data.reset()
        start = datetime.now()
        all_indices = []
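        # Build the per-step task schedule: all_indices holds one task id per
        # batch; --ratio and --mix_opt control how auxiliary-task batches are
        # interleaved with the main task and whether the order is shuffled.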
        if len(train_data_list) > 1 and args.ratio > 0:
            main_indices = [0] * len(train_data_list[0])
            extra_indices = []
            for i in range(1, len(train_data_list)):
                extra_indices += [i] * len(train_data_list[i])
            random_picks = int(
                min(len(train_data_list[0]) * args.ratio, len(extra_indices)))
            extra_indices = np.random.choice(extra_indices,
                                             random_picks,
                                             replace=False)
            if args.mix_opt > 0:
                extra_indices = extra_indices.tolist()
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices.tolist()

        else:
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)

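        # Training loop: draw the next batch from the scheduled task and take
        # one update step; progress is logged and intermediate checkpoints are
        # optionally saved.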
        for i in range(len(all_indices)):
            task_id = all_indices[i]
            batch_meta, batch_data = next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if (model.local_updates) % (args.log_per_updates *
                                        args.grad_accumulation_step
                                        ) == 0 or model.local_updates == 1:
                remaining_time = str((datetime.now() - start) / (i + 1) *
                                     (len(all_indices) - i - 1)).split('.')[0]
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            remaining_time))

            if args.save_per_updates_on and (
                (model.local_updates) %
                (args.save_per_updates * args.grad_accumulation_step) == 0):
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                logger.info('Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)

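        # Per-epoch evaluation: score each dev/test set, dump raw scores as
        # JSON, and write official submission files.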
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = task_defs.global_map.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
                    model,
                    dev_data,
                    metric_meta=task_defs.metric_meta_map[prefix],
                    use_cuda=args.cuda)
                for key, val in dev_metrics.items():
                    logger.warning(
                        'Task {0} -- epoch {1} -- Dev {2}: {3:.3f}'.format(
                            dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                test_metrics, test_predictions, scores, golds, test_ids = eval_model(
                    model,
                    test_data,
                    metric_meta=task_defs.metric_meta_map[prefix],
                    use_cuda=args.cuda,
                    with_label=False)
                score_file = os.path.join(
                    output_dir,
                    '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_test_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)
Example #26
0
def main():
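    # Feature-extraction entry point: run the MT-DNN encoder over the input
    # data and dump token embeddings from the requested layers for every
    # example.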
    parser = argparse.ArgumentParser()
    model_config(parser)
    set_config(parser)
    train_config(parser)
    args = parser.parse_args()
    encoder_type = args.encoder_type
    layer_indexes = [int(x) for x in args.layers.split(",")]
    set_environment(args.seed)
    # process data
    data, is_single_sentence = process_data(args)
    data_type = (DataFormat.PremiseOnly
                 if is_single_sentence else DataFormat.PremiseAndOneHypothesis)
    fout_temp = "{}.tmp".format(args.finput)
    dump_data(data, fout_temp)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    dataset = SingleTaskDataset(
        fout_temp,
        False,
        maxlen=args.max_seq_length,
    )
    batcher = DataLoader(
        dataset,
        batch_size=args.batch_size,
        collate_fn=collater.collate_fn,
        pin_memory=args.cuda,
    )
    opt = vars(args)
    # load model
    if os.path.exists(args.checkpoint):
        state_dict = torch.load(args.checkpoint)
        config = state_dict["config"]
        config["dump_feature"] = True
        opt.update(config)
    else:
        logger.error("#" * 20)
        logger.error(
            "Could not find the init model!\n The parameters will be initialized randomly!"
        )
        logger.error("#" * 20)
        return
    num_all_batches = len(batcher)
    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.cuda:
        model.cuda()

    features_dict = {}
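    # Extract all encoder layers per batch, keep only the requested layers, and
    # truncate each sequence's embeddings to its true length using the mask.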
    for batch_meta, batch_data in batcher:
        batch_meta, batch_data = Collater.patch_data(args.cuda, batch_meta,
                                                     batch_data)
        all_encoder_layers, _ = model.extract(batch_meta, batch_data)
        embeddings = [
            all_encoder_layers[idx].detach().cpu().numpy()
            for idx in layer_indexes
        ]

        uids = batch_meta["uids"]
        masks = batch_data[batch_meta["mask"]].detach().cpu().numpy().tolist()
        for idx, uid in enumerate(uids):
            slen = sum(masks[idx])
            features = {}
            for yidx, layer in enumerate(layer_indexes):
                features[layer] = str(embeddings[yidx][idx][:slen].tolist())
            features_dict[uid] = features

    # save features
    with open(args.foutput, "w", encoding="utf-8") as writer:
        for sample in data:
            uid = sample["uid"]
            feature = features_dict[uid]
            feature["uid"] = uid
            writer.write("{}\n".format(json.dumps(feature)))