Example #1
def main():
    args_parser = argparse.ArgumentParser(
        description='Tuning with graph-based parsing')
    args_parser.add_argument('--mode',
                             choices=['RNN', 'LSTM', 'GRU', 'FastLSTM'],
                             help='architecture of rnn',
                             required=True)
    args_parser.add_argument('--num_epochs',
                             type=int,
                             default=200,
                             help='Number of training epochs')
    args_parser.add_argument('--batch_size',
                             type=int,
                             default=64,
                             help='Number of sentences in each batch')
    args_parser.add_argument('--hidden_size',
                             type=int,
                             default=256,
                             help='Number of hidden units in RNN')
    args_parser.add_argument('--arc_space',
                             type=int,
                             default=128,
                             help='Dimension of arc space')
    args_parser.add_argument('--type_space',
                             type=int,
                             default=128,
                             help='Dimension of type space')
    args_parser.add_argument('--num_layers',
                             type=int,
                             default=1,
                             help='Number of layers of RNN')
    args_parser.add_argument('--num_filters',
                             type=int,
                             default=50,
                             help='Number of filters in CNN')
    args_parser.add_argument('--pos',
                             action='store_true',
                             help='use part-of-speech embedding.')
    args_parser.add_argument('--char',
                             action='store_true',
                             help='use character embedding and CNN.')
    args_parser.add_argument('--pos_dim',
                             type=int,
                             default=50,
                             help='Dimension of POS embeddings')
    args_parser.add_argument('--char_dim',
                             type=int,
                             default=50,
                             help='Dimension of Character embeddings')
    args_parser.add_argument('--opt',
                             choices=['adam', 'sgd', 'adamax'],
                             help='optimization algorithm')
    args_parser.add_argument('--objective',
                             choices=['cross_entropy', 'crf'],
                             default='cross_entropy',
                             help='objective function of training procedure.')
    args_parser.add_argument('--decode',
                             choices=['mst', 'greedy'],
                             help='decoding algorithm',
                             required=True)
    args_parser.add_argument('--learning_rate',
                             type=float,
                             default=0.01,
                             help='Learning rate')
    args_parser.add_argument('--decay_rate',
                             type=float,
                             default=0.05,
                             help='Decay rate of learning rate')
    args_parser.add_argument('--clip',
                             type=float,
                             default=5.0,
                             help='gradient clipping')
    args_parser.add_argument('--gamma',
                             type=float,
                             default=0.0,
                             help='weight for regularization')
    args_parser.add_argument('--epsilon',
                             type=float,
                             default=1e-8,
                             help='epsilon for adam or adamax')
    args_parser.add_argument('--p_rnn',
                             nargs=2,
                             type=float,
                             required=True,
                             help='dropout rate for RNN')
    args_parser.add_argument('--p_in',
                             type=float,
                             default=0.33,
                             help='dropout rate for input embeddings')
    args_parser.add_argument('--p_out',
                             type=float,
                             default=0.33,
                             help='dropout rate for output layer')
    args_parser.add_argument('--schedule',
                             type=int,
                             help='schedule for learning rate decay')
    args_parser.add_argument(
        '--unk_replace',
        type=float,
        default=0.,
        help='The rate to replace a singleton word with UNK')
    args_parser.add_argument('--punctuation',
                             nargs='+',
                             type=str,
                             help='List of punctuations')
    args_parser.add_argument('--word_embedding',
                             choices=['glove', 'senna', 'sskip', 'polyglot'],
                             help='Embedding for words',
                             required=True)
    args_parser.add_argument('--word_path',
                             help='path for word embedding dict')
    args_parser.add_argument(
        '--freeze',
        action='store_true',
        help='freeze the word embedding (disable fine-tuning).')
    args_parser.add_argument('--char_embedding',
                             choices=['random', 'polyglot'],
                             help='Embedding for characters',
                             required=True)
    args_parser.add_argument('--char_path',
                             help='path for character embedding dict')
    args_parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    args_parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    args_parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"
    args_parser.add_argument('--model_path',
                             help='path for saving model file.',
                             required=True)
    args_parser.add_argument('--model_name',
                             help='name for saving model file.',
                             required=True)

    args = args_parser.parse_args()

    logger = get_logger("GraphParser")

    mode = args.mode
    obj = args.objective
    decoding = args.decode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    model_path = args.model_path
    model_name = args.model_name
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    arc_space = args.arc_space
    type_space = args.type_space
    num_layers = args.num_layers
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    opt = args.opt
    momentum = 0.9
    betas = (0.9, 0.9)
    eps = args.epsilon
    decay_rate = args.decay_rate
    clip = args.clip
    gamma = args.gamma
    schedule = args.schedule
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    unk_replace = args.unk_replace
    punctuation = args.punctuation

    freeze = args.freeze
    word_embedding = args.word_embedding
    word_path = args.word_path

    use_char = args.char
    char_embedding = args.char_embedding
    char_path = args.char_path

    use_pos = args.pos
    pos_dim = args.pos_dim
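    # Load the pretrained word-embedding dictionary; a character-embedding
    # dictionary is only loaded when --char_embedding is not 'random'.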
    word_dict, word_dim = utils.load_embedding_dict(word_embedding, word_path)
    char_dict = None
    char_dim = args.char_dim
    if char_embedding != 'random':
        char_dict, char_dim = utils.load_embedding_dict(
            char_embedding, char_path)

    logger.info("Creating Alphabets")
    alphabet_path = os.path.join(model_path, 'alphabets/')
    model_name = os.path.join(model_path, model_name)
    word_alphabet, char_alphabet, pos_alphabet, type_alphabet = conllx_data.create_alphabets(
        alphabet_path,
        train_path,
        data_paths=[dev_path, test_path],
        max_vocabulary_size=50000,
        embedd_dict=word_dict)

    num_words = word_alphabet.size()
    num_chars = char_alphabet.size()
    num_pos = pos_alphabet.size()
    num_types = type_alphabet.size()

    logger.info("Word Alphabet Size: %d" % num_words)
    logger.info("Character Alphabet Size: %d" % num_chars)
    logger.info("POS Alphabet Size: %d" % num_pos)
    logger.info("Type Alphabet Size: %d" % num_types)

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    data_train = conllx_data.read_data_to_variable(train_path,
                                                   word_alphabet,
                                                   char_alphabet,
                                                   pos_alphabet,
                                                   type_alphabet,
                                                   use_gpu=use_gpu,
                                                   symbolic_root=True)
    # data_train = conllx_data.read_data(train_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
    # num_data = sum([len(bucket) for bucket in data_train])
    num_data = sum(data_train[1])

    data_dev = conllx_data.read_data_to_variable(dev_path,
                                                 word_alphabet,
                                                 char_alphabet,
                                                 pos_alphabet,
                                                 type_alphabet,
                                                 use_gpu=use_gpu,
                                                 volatile=True,
                                                 symbolic_root=True)
    data_test = conllx_data.read_data_to_variable(test_path,
                                                  word_alphabet,
                                                  char_alphabet,
                                                  pos_alphabet,
                                                  type_alphabet,
                                                  use_gpu=use_gpu,
                                                  volatile=True,
                                                  symbolic_root=True)

    punct_set = None
    if punctuation is not None:
        punct_set = set(punctuation)
        logger.info("punctuations(%d): %s" %
                    (len(punct_set), ' '.join(punct_set)))

    def construct_word_embedding_table():
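        # Initialise the word-embedding matrix: known words (or their
        # lowercased form) copy the pretrained vector; unknown words get zeros
        # when embeddings are frozen, otherwise uniform noise in
        # [-sqrt(3/dim), +sqrt(3/dim)].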
        scale = np.sqrt(3.0 / word_dim)
        table = np.empty([word_alphabet.size(), word_dim], dtype=np.float32)
        table[conllx_data.UNK_ID, :] = np.zeros([1, word_dim]).astype(
            np.float32) if freeze else np.random.uniform(
                -scale, scale, [1, word_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in word_dict:
                embedding = word_dict[word]
            elif word.lower() in word_dict:
                embedding = word_dict[word.lower()]
            else:
                embedding = np.zeros([1, word_dim]).astype(
                    np.float32) if freeze else np.random.uniform(
                        -scale, scale, [1, word_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('word OOV: %d' % oov)
        return torch.from_numpy(table)

    def construct_char_embedding_table():
        if char_dict is None:
            return None

        scale = np.sqrt(3.0 / char_dim)
        table = np.empty([num_chars, char_dim], dtype=np.float32)
        table[conllx_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, char_dim]).astype(np.float32)
        oov = 0
        for char, index in char_alphabet.items():
            if char in char_dict:
                embedding = char_dict[char]
            else:
                embedding = np.random.uniform(-scale, scale,
                                              [1, char_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('character OOV: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    char_table = construct_char_embedding_table()

    window = 3
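    # Build the biaffine graph parser: word/POS embeddings plus a character
    # CNN feed a multi-layer RNN encoder, followed by biaffine arc and type
    # scorers.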
    if obj == 'cross_entropy':
        network = BiRecurrentConvBiAffine(word_dim,
                                          num_words,
                                          char_dim,
                                          num_chars,
                                          pos_dim,
                                          num_pos,
                                          num_filters,
                                          window,
                                          mode,
                                          hidden_size,
                                          num_layers,
                                          num_types,
                                          arc_space,
                                          type_space,
                                          embedd_word=word_table,
                                          embedd_char=char_table,
                                          p_in=p_in,
                                          p_out=p_out,
                                          p_rnn=p_rnn,
                                          biaffine=True,
                                          pos=use_pos,
                                          char=use_char)
    elif obj == 'crf':
        raise NotImplementedError
    else:
        raise RuntimeError('Unknown objective: %s' % obj)

    if freeze:
        network.word_embedd.freeze()

    if use_gpu:
        network.cuda()

    pred_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet,
                               type_alphabet)
    gold_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet,
                               type_alphabet)

    def generate_optimizer(opt, lr, params):
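        # Optimiser factory: only parameters with requires_grad are optimised,
        # and `gamma` doubles as the L2 weight-decay coefficient.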
        params = filter(lambda param: param.requires_grad, params)
        if opt == 'adam':
            return Adam(params,
                        lr=lr,
                        betas=betas,
                        weight_decay=gamma,
                        eps=eps)
        elif opt == 'sgd':
            return SGD(params,
                       lr=lr,
                       momentum=momentum,
                       weight_decay=gamma,
                       nesterov=True)
        elif opt == 'adamax':
            return Adamax(params,
                          lr=lr,
                          betas=betas,
                          weight_decay=gamma,
                          eps=eps)
        else:
            raise ValueError('Unknown optimization algorithm: %s' % opt)

    lr = learning_rate
    optim = generate_optimizer(opt, lr, network.parameters())
    opt_info = 'opt: %s, ' % opt
    if opt == 'adam':
        opt_info += 'betas=%s, eps=%.1e' % (betas, eps)
    elif opt == 'sgd':
        opt_info += 'momentum=%.2f' % momentum
    elif opt == 'adamax':
        opt_info += 'betas=%s, eps=%.1e' % (betas, eps)

    word_status = 'frozen' if freeze else 'fine tune'
    char_status = 'enabled' if use_char else 'disabled'
    pos_status = 'enabled' if use_pos else 'disabled'
    logger.info(
        "Embedding dim: word=%d (%s), char=%d (%s), pos=%d (%s)" %
        (word_dim, word_status, char_dim, char_status, pos_dim, pos_status))
    logger.info("CNN: filter=%d, kernel=%d" % (num_filters, window))
    logger.info(
        "RNN: %s, num_layer=%d, hidden=%d, arc_space=%d, type_space=%d" %
        (mode, num_layers, hidden_size, arc_space, type_space))
    logger.info(
        "train: obj: %s, l2: %f, (#data: %d, batch: %d, clip: %.2f, unk replace: %.2f)"
        % (obj, gamma, num_data, batch_size, clip, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" %
                (p_in, p_out, p_rnn))
    logger.info("decoding algorithm: %s" % decoding)
    logger.info(opt_info)

    num_batches = num_data // batch_size + 1
    dev_ucorrect = 0.0
    dev_lcorrect = 0.0
    dev_ucomlpete_match = 0.0
    dev_lcomplete_match = 0.0

    dev_ucorrect_nopunc = 0.0
    dev_lcorrect_nopunc = 0.0
    dev_ucomlpete_match_nopunc = 0.0
    dev_lcomplete_match_nopunc = 0.0
    dev_root_correct = 0.0

    best_epoch = 0

    test_ucorrect = 0.0
    test_lcorrect = 0.0
    test_ucomlpete_match = 0.0
    test_lcomplete_match = 0.0

    test_ucorrect_nopunc = 0.0
    test_lcorrect_nopunc = 0.0
    test_ucomlpete_match_nopunc = 0.0
    test_lcomplete_match_nopunc = 0.0
    test_root_correct = 0.0
    test_total = 0
    test_total_nopunc = 0
    test_total_inst = 0
    test_total_root = 0

    if decoding == 'greedy':
        decode = network.decode
    elif decoding == 'mst':
        decode = network.decode_mst
    else:
        raise ValueError('Unknown decoding algorithm: %s' % decoding)

    patient = 0
    decay = 0
    max_decay = 9
    double_schedule_decay = 5
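    # Learning-rate schedule: when dev accuracy stops improving for `schedule`
    # epochs (or UAS drops more than 5 points below the best), reload the best
    # checkpoint, multiply the learning rate by decay_rate, and double the
    # patience window every `double_schedule_decay` decays; training stops
    # after `max_decay` decays.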
    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s, optim: %s, learning rate=%.6f, eps=%.1e, decay rate=%.2f (schedule=%d, patient=%d, decay=%d)): '
            %
            (epoch, mode, opt, lr, eps, decay_rate, schedule, patient, decay))
        train_err = 0.
        train_err_arc = 0.
        train_err_type = 0.
        train_total = 0.
        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            word, char, pos, heads, types, masks, lengths = conllx_data.get_batch_variable(
                data_train, batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss_arc, loss_type = network.loss(word,
                                               char,
                                               pos,
                                               heads,
                                               types,
                                               mask=masks,
                                               length=lengths)
            loss = loss_arc + loss_type
            loss.backward()
            clip_grad_norm(network.parameters(), clip)
            optim.step()

            num_inst = word.size(
                0) if obj == 'crf' else masks.data.sum() - word.size(0)
            train_err += loss.data[0] * num_inst
            train_err_arc += loss_arc.data[0] * num_inst
            train_err_type += loss_type.data[0] * num_inst
            train_total += num_inst

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 10 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, arc: %.4f, type: %.4f, time left: %.2fs' % (
                    batch, num_batches, train_err / train_total, train_err_arc
                    / train_total, train_err_type / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print(
            'train: %d loss: %.4f, arc: %.4f, type: %.4f, time: %.2fs' %
            (num_batches, train_err / train_total, train_err_arc / train_total,
             train_err_type / train_total, time.time() - start_time))

        # evaluate performance on dev data
        network.eval()
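        # Dev evaluation: decode each batch, write predicted and gold trees in
        # CoNLL-X format, and accumulate UAS/LAS/UCM/LCM with and without
        # punctuation, plus root-attachment accuracy.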
        pred_filename = 'tmp/%spred_dev%d' % (str(uid), epoch)
        pred_writer.start(pred_filename)
        gold_filename = 'tmp/%sgold_dev%d' % (str(uid), epoch)
        gold_writer.start(gold_filename)

        dev_ucorr = 0.0
        dev_lcorr = 0.0
        dev_total = 0
        dev_ucomlpete = 0.0
        dev_lcomplete = 0.0
        dev_ucorr_nopunc = 0.0
        dev_lcorr_nopunc = 0.0
        dev_total_nopunc = 0
        dev_ucomlpete_nopunc = 0.0
        dev_lcomplete_nopunc = 0.0
        dev_root_corr = 0.0
        dev_total_root = 0.0
        dev_total_inst = 0.0
        for batch in conllx_data.iterate_batch_variable(data_dev, batch_size):
            word, char, pos, heads, types, masks, lengths = batch
            heads_pred, types_pred = decode(
                word,
                char,
                pos,
                mask=masks,
                length=lengths,
                leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
            word = word.data.cpu().numpy()
            pos = pos.data.cpu().numpy()
            lengths = lengths.cpu().numpy()
            heads = heads.data.cpu().numpy()
            types = types.data.cpu().numpy()

            pred_writer.write(word,
                              pos,
                              heads_pred,
                              types_pred,
                              lengths,
                              symbolic_root=True)
            gold_writer.write(word,
                              pos,
                              heads,
                              types,
                              lengths,
                              symbolic_root=True)

            stats, stats_nopunc, stats_root, num_inst = parser.eval(
                word,
                pos,
                heads_pred,
                types_pred,
                heads,
                types,
                word_alphabet,
                pos_alphabet,
                lengths,
                punct_set=punct_set,
                symbolic_root=True)
            ucorr, lcorr, total, ucm, lcm = stats
            ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
            corr_root, total_root = stats_root

            dev_ucorr += ucorr
            dev_lcorr += lcorr
            dev_total += total
            dev_ucomlpete += ucm
            dev_lcomplete += lcm

            dev_ucorr_nopunc += ucorr_nopunc
            dev_lcorr_nopunc += lcorr_nopunc
            dev_total_nopunc += total_nopunc
            dev_ucomlpete_nopunc += ucm_nopunc
            dev_lcomplete_nopunc += lcm_nopunc

            dev_root_corr += corr_root
            dev_total_root += total_root

            dev_total_inst += num_inst

        pred_writer.close()
        gold_writer.close()
        print(
            'W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%'
            % (dev_ucorr, dev_lcorr, dev_total, dev_ucorr * 100 / dev_total,
               dev_lcorr * 100 / dev_total, dev_ucomlpete * 100 /
               dev_total_inst, dev_lcomplete * 100 / dev_total_inst))
        print(
            'Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%'
            % (dev_ucorr_nopunc, dev_lcorr_nopunc, dev_total_nopunc,
               dev_ucorr_nopunc * 100 / dev_total_nopunc, dev_lcorr_nopunc *
               100 / dev_total_nopunc, dev_ucomlpete_nopunc * 100 /
               dev_total_inst, dev_lcomplete_nopunc * 100 / dev_total_inst))
        print('Root: corr: %d, total: %d, acc: %.2f%%' %
              (dev_root_corr, dev_total_root,
               dev_root_corr * 100 / dev_total_root))

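        # Model selection: keep the checkpoint with the highest dev LAS
        # (ties broken by UAS, punctuation excluded); on improvement, save the
        # state dict and re-evaluate on the test set.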
        if dev_lcorrect_nopunc < dev_lcorr_nopunc or (
                dev_lcorrect_nopunc == dev_lcorr_nopunc
                and dev_ucorrect_nopunc < dev_ucorr_nopunc):
            dev_ucorrect_nopunc = dev_ucorr_nopunc
            dev_lcorrect_nopunc = dev_lcorr_nopunc
            dev_ucomlpete_match_nopunc = dev_ucomlpete_nopunc
            dev_lcomplete_match_nopunc = dev_lcomplete_nopunc

            dev_ucorrect = dev_ucorr
            dev_lcorrect = dev_lcorr
            dev_ucomlpete_match = dev_ucomlpete
            dev_lcomplete_match = dev_lcomplete

            dev_root_correct = dev_root_corr

            best_epoch = epoch
            patient = 0
            # torch.save(network, model_name)
            torch.save(network.state_dict(), model_name)

            pred_filename = 'tmp/%spred_test%d' % (str(uid), epoch)
            pred_writer.start(pred_filename)
            gold_filename = 'tmp/%sgold_test%d' % (str(uid), epoch)
            gold_writer.start(gold_filename)

            test_ucorrect = 0.0
            test_lcorrect = 0.0
            test_ucomlpete_match = 0.0
            test_lcomplete_match = 0.0
            test_total = 0

            test_ucorrect_nopunc = 0.0
            test_lcorrect_nopunc = 0.0
            test_ucomlpete_match_nopunc = 0.0
            test_lcomplete_match_nopunc = 0.0
            test_total_nopunc = 0
            test_total_inst = 0

            test_root_correct = 0.0
            test_total_root = 0
            for batch in conllx_data.iterate_batch_variable(
                    data_test, batch_size):
                word, char, pos, heads, types, masks, lengths = batch
                heads_pred, types_pred = decode(
                    word,
                    char,
                    pos,
                    mask=masks,
                    length=lengths,
                    leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
                word = word.data.cpu().numpy()
                pos = pos.data.cpu().numpy()
                lengths = lengths.cpu().numpy()
                heads = heads.data.cpu().numpy()
                types = types.data.cpu().numpy()

                pred_writer.write(word,
                                  pos,
                                  heads_pred,
                                  types_pred,
                                  lengths,
                                  symbolic_root=True)
                gold_writer.write(word,
                                  pos,
                                  heads,
                                  types,
                                  lengths,
                                  symbolic_root=True)

                stats, stats_nopunc, stats_root, num_inst = parser.eval(
                    word,
                    pos,
                    heads_pred,
                    types_pred,
                    heads,
                    types,
                    word_alphabet,
                    pos_alphabet,
                    lengths,
                    punct_set=punct_set,
                    symbolic_root=True)
                ucorr, lcorr, total, ucm, lcm = stats
                ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
                corr_root, total_root = stats_root

                test_ucorrect += ucorr
                test_lcorrect += lcorr
                test_total += total
                test_ucomlpete_match += ucm
                test_lcomplete_match += lcm

                test_ucorrect_nopunc += ucorr_nopunc
                test_lcorrect_nopunc += lcorr_nopunc
                test_total_nopunc += total_nopunc
                test_ucomlpete_match_nopunc += ucm_nopunc
                test_lcomplete_match_nopunc += lcm_nopunc

                test_root_correct += corr_root
                test_total_root += total_root

                test_total_inst += num_inst

            pred_writer.close()
            gold_writer.close()
        else:
            if (dev_ucorr_nopunc * 100 / dev_total_nopunc <
                    dev_ucorrect_nopunc * 100 / dev_total_nopunc - 5
                    or patient >= schedule):
                # network = torch.load(model_name)
                network.load_state_dict(torch.load(model_name))
                lr = lr * decay_rate
                optim = generate_optimizer(opt, lr, network.parameters())

                if decoding == 'greedy':
                    decode = network.decode
                elif decoding == 'mst':
                    decode = network.decode_mst
                else:
                    raise ValueError('Unknown decoding algorithm: %s' %
                                     decoding)

                patient = 0
                decay += 1
                if decay % double_schedule_decay == 0:
                    schedule *= 2
            else:
                patient += 1

        print(
            '----------------------------------------------------------------------------------------------------------------------------'
        )
        print(
            'best dev  W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
            % (dev_ucorrect, dev_lcorrect, dev_total,
               dev_ucorrect * 100 / dev_total, dev_lcorrect * 100 / dev_total,
               dev_ucomlpete_match * 100 / dev_total_inst,
               dev_lcomplete_match * 100 / dev_total_inst, best_epoch))
        print(
            'best dev  Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
            % (dev_ucorrect_nopunc, dev_lcorrect_nopunc, dev_total_nopunc,
               dev_ucorrect_nopunc * 100 / dev_total_nopunc,
               dev_lcorrect_nopunc * 100 / dev_total_nopunc,
               dev_ucomlpete_match_nopunc * 100 / dev_total_inst,
               dev_lcomplete_match_nopunc * 100 / dev_total_inst, best_epoch))
        print('best dev  Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)' %
              (dev_root_correct, dev_total_root,
               dev_root_correct * 100 / dev_total_root, best_epoch))
        print(
            '----------------------------------------------------------------------------------------------------------------------------'
        )
        print(
            'best test W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
            % (test_ucorrect, test_lcorrect, test_total, test_ucorrect * 100 /
               test_total, test_lcorrect * 100 / test_total,
               test_ucomlpete_match * 100 / test_total_inst,
               test_lcomplete_match * 100 / test_total_inst, best_epoch))
        print(
            'best test Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
            %
            (test_ucorrect_nopunc, test_lcorrect_nopunc, test_total_nopunc,
             test_ucorrect_nopunc * 100 / test_total_nopunc,
             test_lcorrect_nopunc * 100 / test_total_nopunc,
             test_ucomlpete_match_nopunc * 100 / test_total_inst,
             test_lcomplete_match_nopunc * 100 / test_total_inst, best_epoch))
        print('best test Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)' %
              (test_root_correct, test_total_root,
               test_root_correct * 100 / test_total_root, best_epoch))
        print(
            '============================================================================================================================'
        )

        if decay == max_decay:
            break
Example #2
# (fragment: the keyword arguments below look like the tail of a constructor
#  call that rebuilds the parser from saved kwargs)
                                  p_in=kwargs['p_in'],
                                  p_out=kwargs['p_out'],
                                  p_rnn=kwargs['p_rnn'],
                                  biaffine=kwargs['biaffine'],
                                  pos=kwargs['pos'],
                                  char=kwargs['char'])

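# Restore the pretrained parser's alphabets so that word/char/POS/type indices
# line up with the checkpoint loaded into `network` below.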
char_alphabet.load(pretrained_dependency_parser_vocab_fn)
pos_alphabet.load(pretrained_dependency_parser_vocab_fn)
type_alphabet.load(pretrained_dependency_parser_vocab_fn)
word_alphabet.load(pretrained_dependency_parser_vocab_fn)

# char_alphabet.keep_growing = True
# word_alphabet.keep_growing = True

network.load_state_dict(parser)

#------------------------create data specific alphabets

print("Creating graph classifier Alphabets")
alphabet_path = os.path.join(custom_args.graph_alphabet_folder,
                             custom_args.dataset_name)
train_path, dev_path, test_path, _, _, _ = datasets.DATASET_FILES[
    custom_args.dataset_name]
if custom_args.bio_embeddings != 'none':
    if 'bio' in custom_args.bio_embeddings:
        embedding_vocab_dict = datasets.load_bio_word_embedding_vocab(
            custom_args.bio_embeddings)
    elif 'glove' in custom_args.bio_embeddings:
        embedding_vocab_dict = datasets.load_glove_word_embedding_vocab(
            custom_args.bio_embeddings)
Example #3
def main():
    args_parser = argparse.ArgumentParser(
        description='Tuning with graph-based parsing')
    args_parser.add_argument('--mode',
                             choices=['RNN', 'LSTM', 'GRU', 'FastLSTM'],
                             help='architecture of rnn',
                             required=True)
    args_parser.add_argument('--cuda', action='store_true', help='using GPU')
    args_parser.add_argument('--num_epochs',
                             type=int,
                             default=200,
                             help='Number of training epochs')
    args_parser.add_argument('--batch_size',
                             type=int,
                             default=64,
                             help='Number of sentences in each batch')
    args_parser.add_argument('--hidden_size',
                             type=int,
                             default=256,
                             help='Number of hidden units in RNN')
    args_parser.add_argument('--arc_space',
                             type=int,
                             default=128,
                             help='Dimension of arc space')
    args_parser.add_argument('--type_space',
                             type=int,
                             default=128,
                             help='Dimension of type space')
    args_parser.add_argument('--num_layers',
                             type=int,
                             default=1,
                             help='Number of layers of RNN')
    args_parser.add_argument('--num_filters',
                             type=int,
                             default=50,
                             help='Number of filters in CNN')
    args_parser.add_argument('--pos',
                             action='store_true',
                             help='use part-of-speech embedding.')
    args_parser.add_argument('--char',
                             action='store_true',
                             help='use character embedding and CNN.')
    args_parser.add_argument('--pos_dim',
                             type=int,
                             default=50,
                             help='Dimension of POS embeddings')
    args_parser.add_argument('--char_dim',
                             type=int,
                             default=50,
                             help='Dimension of Character embeddings')
    args_parser.add_argument('--opt',
                             choices=['adam', 'sgd', 'adamax'],
                             help='optimization algorithm')
    args_parser.add_argument('--objective',
                             choices=['cross_entropy', 'crf'],
                             default='cross_entropy',
                             help='objective function of training procedure.')
    args_parser.add_argument('--decode',
                             choices=['mst', 'greedy'],
                             help='decoding algorithm',
                             required=True)
    args_parser.add_argument('--learning_rate',
                             type=float,
                             default=0.01,
                             help='Learning rate')
    args_parser.add_argument('--decay_rate',
                             type=float,
                             default=0.05,
                             help='Decay rate of learning rate')
    args_parser.add_argument('--clip',
                             type=float,
                             default=5.0,
                             help='gradient clipping')
    args_parser.add_argument('--gamma',
                             type=float,
                             default=0.0,
                             help='weight for regularization')
    args_parser.add_argument('--epsilon',
                             type=float,
                             default=1e-8,
                             help='epsilon for adam or adamax')
    args_parser.add_argument('--p_rnn',
                             nargs=2,
                             type=float,
                             required=True,
                             help='dropout rate for RNN')
    args_parser.add_argument('--p_in',
                             type=float,
                             default=0.33,
                             help='dropout rate for input embeddings')
    args_parser.add_argument('--p_out',
                             type=float,
                             default=0.33,
                             help='dropout rate for output layer')
    args_parser.add_argument('--schedule',
                             type=int,
                             help='schedule for learning rate decay')
    args_parser.add_argument(
        '--unk_replace',
        type=float,
        default=0.,
        help='The rate to replace a singleton word with UNK')
    args_parser.add_argument('--punctuation',
                             nargs='+',
                             type=str,
                             help='List of punctuations')
    args_parser.add_argument('--word_embedding',
                             choices=['glove', 'senna', 'sskip', 'polyglot'],
                             help='Embedding for words',
                             required=True)
    args_parser.add_argument('--word_path',
                             help='path for word embedding dict')
    args_parser.add_argument(
        '--freeze',
        action='store_true',
        help='freeze the word embedding (disable fine-tuning).')
    args_parser.add_argument('--char_embedding',
                             choices=['random', 'polyglot'],
                             help='Embedding for characters',
                             required=True)
    args_parser.add_argument('--char_path',
                             help='path for character embedding dict')
    args_parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    args_parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    args_parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"
    args_parser.add_argument('--model_path',
                             help='path for saving model file.',
                             required=True)
    args_parser.add_argument('--model_name',
                             help='name for saving model file.',
                             required=True)

    args_parser.add_argument('--seq2seq_save_path',
                             default='models/seq2seq/seq2seq_save_model',
                             type=str,
                             help='seq2seq_save_path')
    args_parser.add_argument('--network_save_path',
                             default='models/seq2seq/network_save_model',
                             type=str,
                             help='network_save_path')

    args_parser.add_argument('--seq2seq_load_path',
                             default='models/seq2seq/seq2seq_save_model',
                             type=str,
                             help='seq2seq_load_path')
    args_parser.add_argument('--network_load_path',
                             default='models/seq2seq/network_save_model',
                             type=str,
                             help='network_load_path')

    args_parser.add_argument('--rl_finetune_seq2seq_save_path',
                             default='models/rl_finetune/seq2seq_save_model',
                             type=str,
                             help='rl_finetune_seq2seq_save_path')
    args_parser.add_argument('--rl_finetune_network_save_path',
                             default='models/rl_finetune/network_save_model',
                             type=str,
                             help='rl_finetune_network_save_path')

    args_parser.add_argument('--rl_finetune_seq2seq_load_path',
                             default='models/rl_finetune/seq2seq_save_model',
                             type=str,
                             help='rl_finetune_seq2seq_load_path')
    args_parser.add_argument('--rl_finetune_network_load_path',
                             default='models/rl_finetune/network_save_model',
                             type=str,
                             help='rl_finetune_network_load_path')

    args = args_parser.parse_args()

    # args.train = "data/ptb/dev.conllu"
    # args.dev = "data/ptb/dev.conllu"
    # args.test = "data/ptb/dev.conllu"

    logger = get_logger("GraphParser")
    TEST_PARSER_FLAG = False  # TODO
    # SEED = 0
    # torch.manual_seed(SEED)
    # torch.cuda.manual_seed(SEED)

    mode = args.mode
    obj = args.objective
    decoding = args.decode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    model_path = args.model_path
    model_name = args.model_name
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    arc_space = args.arc_space
    type_space = args.type_space
    num_layers = args.num_layers
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    opt = args.opt
    momentum = 0.9
    betas = (0.9, 0.9)
    eps = args.epsilon
    decay_rate = args.decay_rate
    clip = args.clip
    gamma = args.gamma
    schedule = args.schedule
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    unk_replace = args.unk_replace
    punctuation = args.punctuation

    freeze = args.freeze
    word_embedding = args.word_embedding
    word_path = args.word_path

    use_char = args.char
    char_embedding = args.char_embedding
    char_path = args.char_path

    use_pos = args.pos
    pos_dim = args.pos_dim
    word_dict, word_dim = utils.load_embedding_dict(word_embedding, word_path)

    char_dict = None
    char_dim = args.char_dim
    if char_embedding != 'random':
        char_dict, char_dim = utils.load_embedding_dict(
            char_embedding, char_path)

    logger.info("Creating Alphabets")
    alphabet_path = os.path.join(model_path, 'alphabets/')
    model_name = os.path.join(model_path, model_name)
    word_alphabet, char_alphabet, pos_alphabet, type_alphabet = conllx_data.create_alphabets(
        alphabet_path,
        train_path,
        data_paths=[dev_path, test_path],
        max_vocabulary_size=100000,
        embedd_dict=word_dict)

    num_words = word_alphabet.size()
    num_chars = char_alphabet.size()
    num_pos = pos_alphabet.size()
    num_types = type_alphabet.size()

    logger.info("Word Alphabet Size: %d" % num_words)
    logger.info("Character Alphabet Size: %d" % num_chars)
    logger.info("POS Alphabet Size: %d" % num_pos)
    logger.info("Type Alphabet Size: %d" % num_types)

    logger.info("Reading Data")
    # TODO: currently hard-coded to the first GPU; the intended form is
    # torch.device('cuda:0') if args.cuda else torch.device('cpu')
    device = torch.device('cuda:0')

    data_train = conllx_data.read_data_to_tensor(train_path,
                                                 word_alphabet,
                                                 char_alphabet,
                                                 pos_alphabet,
                                                 type_alphabet,
                                                 symbolic_root=True,
                                                 device=device)
    # data_train = conllx_data.read_data(train_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
    # num_data = sum([len(bucket) for bucket in data_train])
    num_data = sum(data_train[1])

    data_dev = conllx_data.read_data_to_tensor(dev_path,
                                               word_alphabet,
                                               char_alphabet,
                                               pos_alphabet,
                                               type_alphabet,
                                               symbolic_root=True,
                                               device=device)
    data_test = conllx_data.read_data_to_tensor(test_path,
                                                word_alphabet,
                                                char_alphabet,
                                                pos_alphabet,
                                                type_alphabet,
                                                symbolic_root=True,
                                                device=device)

    punct_set = None
    if punctuation is not None:
        punct_set = set(punctuation)
        logger.info("punctuations(%d): %s" %
                    (len(punct_set), ' '.join(punct_set)))

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / word_dim)
        table = np.empty([word_alphabet.size(), word_dim], dtype=np.float32)
        table[conllx_data.UNK_ID, :] = np.zeros([1, word_dim]).astype(
            np.float32) if freeze else np.random.uniform(
                -scale, scale, [1, word_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in word_dict:
                embedding = word_dict[word]
            elif word.lower() in word_dict:
                embedding = word_dict[word.lower()]
            else:
                embedding = np.zeros([1, word_dim]).astype(
                    np.float32) if freeze else np.random.uniform(
                        -scale, scale, [1, word_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('word OOV: %d' % oov)
        return torch.from_numpy(table)

    def construct_char_embedding_table():
        if char_dict is None:
            return None

        scale = np.sqrt(3.0 / char_dim)
        table = np.empty([num_chars, char_dim], dtype=np.float32)
        table[conllx_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, char_dim]).astype(np.float32)
        oov = 0
        for char, index in char_alphabet.items():
            if char in char_dict:
                embedding = char_dict[char]
            else:
                embedding = np.random.uniform(-scale, scale,
                                              [1, char_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('character OOV: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    char_table = construct_char_embedding_table()

    # Pretrain structure prediction model (biaff model). model name: network
    window = 3
    if obj == 'cross_entropy':
        network = BiRecurrentConvBiAffine(word_dim,
                                          num_words,
                                          char_dim,
                                          num_chars,
                                          pos_dim,
                                          num_pos,
                                          num_filters,
                                          window,
                                          mode,
                                          hidden_size,
                                          num_layers,
                                          num_types,
                                          arc_space,
                                          type_space,
                                          embedd_word=word_table,
                                          embedd_char=char_table,
                                          p_in=p_in,
                                          p_out=p_out,
                                          p_rnn=p_rnn,
                                          biaffine=True,
                                          pos=use_pos,
                                          char=use_char)
    elif obj == 'crf':
        raise NotImplementedError
    else:
        raise RuntimeError('Unknown objective: %s' % obj)

    def save_args():
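        # Dump the constructor args and kwargs to <model_name>.arg.json so the
        # same architecture can be re-instantiated when the checkpoint is
        # loaded later.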
        arg_path = model_name + '.arg.json'
        arguments = [
            word_dim, num_words, char_dim, num_chars, pos_dim, num_pos,
            num_filters, window, mode, hidden_size, num_layers, num_types,
            arc_space, type_space
        ]
        kwargs = {
            'p_in': p_in,
            'p_out': p_out,
            'p_rnn': p_rnn,
            'biaffine': True,
            'pos': use_pos,
            'char': use_char
        }
        json.dump({
            'args': arguments,
            'kwargs': kwargs
        },
                  open(arg_path, 'w'),
                  indent=4)

    if freeze:
        freeze_embedding(network.word_embedd)

    network = network.to(device)

    save_args()

    pred_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet,
                               type_alphabet)
    gold_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet,
                               type_alphabet)

    def generate_optimizer(opt, lr, params):
        params = filter(lambda param: param.requires_grad, params)
        if opt == 'adam':
            return Adam(params,
                        lr=lr,
                        betas=betas,
                        weight_decay=gamma,
                        eps=eps)
        elif opt == 'sgd':
            return SGD(params,
                       lr=lr,
                       momentum=momentum,
                       weight_decay=gamma,
                       nesterov=True)
        elif opt == 'adamax':
            return Adamax(params,
                          lr=lr,
                          betas=betas,
                          weight_decay=gamma,
                          eps=eps)
        else:
            raise ValueError('Unknown optimization algorithm: %s' % opt)

    lr = learning_rate
    optim = generate_optimizer(opt, lr, network.parameters())
    opt_info = 'opt: %s, ' % opt
    if opt == 'adam':
        opt_info += 'betas=%s, eps=%.1e' % (betas, eps)
    elif opt == 'sgd':
        opt_info += 'momentum=%.2f' % momentum
    elif opt == 'adamax':
        opt_info += 'betas=%s, eps=%.1e' % (betas, eps)

    word_status = 'frozen' if freeze else 'fine tune'
    char_status = 'enabled' if use_char else 'disabled'
    pos_status = 'enabled' if use_pos else 'disabled'
    logger.info(
        "Embedding dim: word=%d (%s), char=%d (%s), pos=%d (%s)" %
        (word_dim, word_status, char_dim, char_status, pos_dim, pos_status))
    logger.info("CNN: filter=%d, kernel=%d" % (num_filters, window))
    logger.info(
        "RNN: %s, num_layer=%d, hidden=%d, arc_space=%d, type_space=%d" %
        (mode, num_layers, hidden_size, arc_space, type_space))
    logger.info(
        "train: obj: %s, l2: %f, (#data: %d, batch: %d, clip: %.2f, unk replace: %.2f)"
        % (obj, gamma, num_data, batch_size, clip, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" %
                (p_in, p_out, p_rnn))
    logger.info("decoding algorithm: %s" % decoding)
    logger.info(opt_info)

    num_batches = num_data // batch_size + 1
    dev_ucorrect = 0.0
    dev_lcorrect = 0.0
    dev_ucomlpete_match = 0.0
    dev_lcomplete_match = 0.0

    dev_ucorrect_nopunc = 0.0
    dev_lcorrect_nopunc = 0.0
    dev_ucomlpete_match_nopunc = 0.0
    dev_lcomplete_match_nopunc = 0.0
    dev_root_correct = 0.0

    best_epoch = 0

    test_ucorrect = 0.0
    test_lcorrect = 0.0
    test_ucomlpete_match = 0.0
    test_lcomplete_match = 0.0

    test_ucorrect_nopunc = 0.0
    test_lcorrect_nopunc = 0.0
    test_ucomlpete_match_nopunc = 0.0
    test_lcomplete_match_nopunc = 0.0
    test_root_correct = 0.0
    test_total = 0
    test_total_nopunc = 0
    test_total_inst = 0
    test_total_root = 0

    if decoding == 'greedy':
        decode = network.decode
    elif decoding == 'mst':
        decode = network.decode_mst
    else:
        raise ValueError('Unknown decoding algorithm: %s' % decoding)

    print('Pretrain biaffine model.')
    patient = 0
    decay = 0
    max_decay = 9
    double_schedule_decay = 5
    num_epochs = 1  # debug hanwj
    network.load_state_dict(
        torch.load('models/parsing/biaffine/network.pt'))  # TODO: 7.13
    network.to(device)
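    # The weights of an already-trained biaffine parser are loaded above; the
    # loop below does not train the parser but perturbs the word-embedding
    # matrix to generate adversarial sentences.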
    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s, optim: %s, learning rate=%.6f, eps=%.1e, decay rate=%.2f (schedule=%d, patient=%d, decay=%d)): '
            %
            (epoch, mode, opt, lr, eps, decay_rate, schedule, patient, decay))
        train_err = 0.
        train_err_arc = 0.
        train_err_type = 0.
        train_total = 0.
        start_time = time.time()
        num_back = 0
        network.train()
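        # Freeze everything except the first parameter tensor (the word
        # embedding matrix, cf. the [35374, 100] shape noted below) so that the
        # gradient step only moves word vectors.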
        FLAG = True
        for para in network.parameters():
            if FLAG:
                para.requires_grad = True
                FLAG = False
            else:
                para.requires_grad = False
        # network.state_dict().items()[0][1].requires_grad = True
        # ori_word_embedding = network.parameters().next().detach().data.cpu().numpy().T  # torch.Size([35374, 100])
        # c_ = -2 * np.dot(ori_word_embedding.T, ori_word_embedding)
        # a = np.sum(np.square(ori_word_embedding), axis=0).reshape((1, -1))
        # b = a.T
        # dist = a + b + c_
        # np.save('base_attach/dist_counter_35374.npy', dist)
        outf = 'base_attach/adv_sentences.txt'
        wf = codecs.open(outf, 'w', encoding='utf8')
        ori_outf = 'base_attach/ori_sentences.txt'
        ori_wf = codecs.open(ori_outf, 'w', encoding='utf8')
        # Transposed copy of the word-embedding matrix (the first parameter
        # tensor), used for the nearest-neighbour lookups below.
        ori_word_embedding = next(
            network.parameters()).detach().cpu().numpy().T  # torch.Size([35374, 100])
        for batch in conllx_data.iterate_batch_tensor(data_dev, batch_size):
            word, char, pos, heads, types, masks, lengths = batch

            optim.zero_grad()
            loss_arc, loss_type = network.loss(word,
                                               char,
                                               pos,
                                               heads,
                                               types,
                                               mask=masks,
                                               length=lengths)
            loss = loss_arc  #+  loss_type
            loss = -loss
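            # Negating the loss turns the optimiser step into gradient ascent
            # on the arc loss with respect to the (only trainable) embeddings.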
            loss.backward()
            clip_grad_norm_(network.parameters(), clip)
            optim.step()

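            # Write the original sentence, then replace each word (id 1 is
            # skipped) with the vocabulary word whose embedding is closest, by
            # squared Euclidean distance, to its adversarially shifted vector,
            # and write the resulting adversarial sentence.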
            for batch_i in range(len(word)):
                ori_wf.write(' '.join([
                    str(word_alphabet.get_instance(wordi))
                    for wordi in word[batch_i][1:] if not wordi == 1
                ]))
                ori_wf.write('\n')
            # adv_word_embedding = network.parameters().next().detach().data.cpu().numpy().T  #torch.Size([35374, 100])
            for batch_i in range(len(word)):
                for batch_j in range(1, len(word[0])):
                    one_word = word[batch_i][batch_j].item()
                    if one_word == 1:
                        continue
                    src_word_idx = one_word
                    adv_vector = next(
                        network.parameters()).detach().cpu().numpy()[src_word_idx, :]
                    ori_word_embedding[:, src_word_idx] = adv_vector
                    c_ = -2 * np.dot(ori_word_embedding.T, adv_vector)
                    a = np.sum(np.square(ori_word_embedding),
                               axis=0)  #.reshape((1, -1))
                    b = np.sum(np.square(adv_vector))
                    dist = a + c_ + b
                    neighbours, _ = glove_utils.pick_most_similar_words_from_vector(
                        src_word_idx,
                        adv_vector,
                        dist,
                        ret_count=1,
                        threshold=5)
                    word[batch_i][batch_j] = neighbours[0]
            for batch_i in range(len(word)):
                wf.write(' '.join([
                    str(word_alphabet.get_instance(wordi))
                    for wordi in word[batch_i][1:] if not wordi == 1
                ]))
                wf.write('\n')
        wf.close()
        ori_wf.close()
        #     with torch.no_grad():
        #         num_inst = word.size(0) if obj == 'crf' else masks.sum() - word.size(0)
        #         train_err += loss * num_inst
        #         train_err_arc += loss_arc * num_inst
        #         train_err_type += loss_type * num_inst
        #         train_total += num_inst
        #
        #    time_ave = (time.time() - start_time) / batch
        #     time_left = (num_batches - batch) * time_ave
        #
        #     # update log
        #     if batch % 10 == 0:
        #         sys.stdout.write("\b" * num_back)
        #         sys.stdout.write(" " * num_back)
        #         sys.stdout.write("\b" * num_back)
        #         log_info = 'train: %d/%d loss: %.4f, arc: %.4f, type: %.4f, time left: %.2fs' % (batch, num_batches, train_err / train_total,
        #                                                                                          train_err_arc / train_total, train_err_type / train_total, time_left)
        #         sys.stdout.write(log_info)
        #         sys.stdout.flush()
        #         num_back = len(log_info)
        #
        # sys.stdout.write("\b" * num_back)
        # sys.stdout.write(" " * num_back)
        # sys.stdout.write("\b" * num_back)
        # print('train: %d loss: %.4f, arc: %.4f, type: %.4f, time: %.2fs' % (num_batches, train_err / train_total,
        #                                                                     train_err_arc / train_total, train_err_type / train_total, time.time() - start_time))

        # evaluate performance on dev data
        if not TEST_PARSER_FLAG:
            continue
        with torch.no_grad():
            network.eval()
            pred_filename = 'tmp/%spred_dev%d' % (str(uid), epoch)
            pred_writer.start(pred_filename)
            gold_filename = 'tmp/%sgold_dev%d' % (str(uid), epoch)
            gold_writer.start(gold_filename)

            dev_ucorr = 0.0
            dev_lcorr = 0.0
            dev_total = 0
            dev_ucomlpete = 0.0
            dev_lcomplete = 0.0
            dev_ucorr_nopunc = 0.0
            dev_lcorr_nopunc = 0.0
            dev_total_nopunc = 0
            dev_ucomlpete_nopunc = 0.0
            dev_lcomplete_nopunc = 0.0
            dev_root_corr = 0.0
            dev_total_root = 0.0
            dev_total_inst = 0.0
            for batch in conllx_data.iterate_batch_tensor(
                    data_dev, batch_size):
                word, char, pos, heads, types, masks, lengths = batch
                heads_pred, types_pred = decode(
                    word,
                    char,
                    pos,
                    mask=masks,
                    length=lengths,
                    leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
                word = word.cpu().numpy()
                pos = pos.cpu().numpy()
                lengths = lengths.cpu().numpy()
                heads = heads.cpu().numpy()
                types = types.cpu().numpy()

                pred_writer.write(word,
                                  pos,
                                  heads_pred,
                                  types_pred,
                                  lengths,
                                  symbolic_root=True)
                gold_writer.write(word,
                                  pos,
                                  heads,
                                  types,
                                  lengths,
                                  symbolic_root=True)

                stats, stats_nopunc, stats_root, num_inst = parser.eval(
                    word,
                    pos,
                    heads_pred,
                    types_pred,
                    heads,
                    types,
                    word_alphabet,
                    pos_alphabet,
                    lengths,
                    punct_set=punct_set,
                    symbolic_root=True)
                ucorr, lcorr, total, ucm, lcm = stats
                ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
                corr_root, total_root = stats_root

                dev_ucorr += ucorr
                dev_lcorr += lcorr
                dev_total += total
                dev_ucomlpete += ucm
                dev_lcomplete += lcm

                dev_ucorr_nopunc += ucorr_nopunc
                dev_lcorr_nopunc += lcorr_nopunc
                dev_total_nopunc += total_nopunc
                dev_ucomlpete_nopunc += ucm_nopunc
                dev_lcomplete_nopunc += lcm_nopunc

                dev_root_corr += corr_root
                dev_total_root += total_root

                dev_total_inst += num_inst

            pred_writer.close()
            gold_writer.close()
            print(
                'W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%'
                % (dev_ucorr, dev_lcorr, dev_total, dev_ucorr * 100 /
                   dev_total, dev_lcorr * 100 / dev_total, dev_ucomlpete *
                   100 / dev_total_inst, dev_lcomplete * 100 / dev_total_inst))
            print(
                'Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%'
                %
                (dev_ucorr_nopunc, dev_lcorr_nopunc, dev_total_nopunc,
                 dev_ucorr_nopunc * 100 / dev_total_nopunc, dev_lcorr_nopunc *
                 100 / dev_total_nopunc, dev_ucomlpete_nopunc * 100 /
                 dev_total_inst, dev_lcomplete_nopunc * 100 / dev_total_inst))
            print('Root: corr: %d, total: %d, acc: %.2f%%' %
                  (dev_root_corr, dev_total_root,
                   dev_root_corr * 100 / dev_total_root))

            if dev_lcorrect_nopunc < dev_lcorr_nopunc or (
                    dev_lcorrect_nopunc == dev_lcorr_nopunc
                    and dev_ucorrect_nopunc < dev_ucorr_nopunc):
                dev_ucorrect_nopunc = dev_ucorr_nopunc
                dev_lcorrect_nopunc = dev_lcorr_nopunc
                dev_ucomlpete_match_nopunc = dev_ucomlpete_nopunc
                dev_lcomplete_match_nopunc = dev_lcomplete_nopunc

                dev_ucorrect = dev_ucorr
                dev_lcorrect = dev_lcorr
                dev_ucomlpete_match = dev_ucomlpete
                dev_lcomplete_match = dev_lcomplete

                dev_root_correct = dev_root_corr

                best_epoch = epoch
                patient = 0
                # torch.save(network, model_name)
                torch.save(network.state_dict(), model_name)

                pred_filename = 'tmp/%spred_test%d' % (str(uid), epoch)
                pred_writer.start(pred_filename)
                gold_filename = 'tmp/%sgold_test%d' % (str(uid), epoch)
                gold_writer.start(gold_filename)

                test_ucorrect = 0.0
                test_lcorrect = 0.0
                test_ucomlpete_match = 0.0
                test_lcomplete_match = 0.0
                test_total = 0

                test_ucorrect_nopunc = 0.0
                test_lcorrect_nopunc = 0.0
                test_ucomlpete_match_nopunc = 0.0
                test_lcomplete_match_nopunc = 0.0
                test_total_nopunc = 0
                test_total_inst = 0

                test_root_correct = 0.0
                test_total_root = 0
                for batch in conllx_data.iterate_batch_tensor(
                        data_test, batch_size):
                    word, char, pos, heads, types, masks, lengths = batch
                    heads_pred, types_pred = decode(
                        word,
                        char,
                        pos,
                        mask=masks,
                        length=lengths,
                        leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
                    word = word.cpu().numpy()
                    pos = pos.cpu().numpy()
                    lengths = lengths.cpu().numpy()
                    heads = heads.cpu().numpy()
                    types = types.cpu().numpy()

                    pred_writer.write(word,
                                      pos,
                                      heads_pred,
                                      types_pred,
                                      lengths,
                                      symbolic_root=True)
                    gold_writer.write(word,
                                      pos,
                                      heads,
                                      types,
                                      lengths,
                                      symbolic_root=True)

                    stats, stats_nopunc, stats_root, num_inst = parser.eval(
                        word,
                        pos,
                        heads_pred,
                        types_pred,
                        heads,
                        types,
                        word_alphabet,
                        pos_alphabet,
                        lengths,
                        punct_set=punct_set,
                        symbolic_root=True)
                    ucorr, lcorr, total, ucm, lcm = stats
                    ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
                    corr_root, total_root = stats_root

                    test_ucorrect += ucorr
                    test_lcorrect += lcorr
                    test_total += total
                    test_ucomlpete_match += ucm
                    test_lcomplete_match += lcm

                    test_ucorrect_nopunc += ucorr_nopunc
                    test_lcorrect_nopunc += lcorr_nopunc
                    test_total_nopunc += total_nopunc
                    test_ucomlpete_match_nopunc += ucm_nopunc
                    test_lcomplete_match_nopunc += lcm_nopunc

                    test_root_correct += corr_root
                    test_total_root += total_root

                    test_total_inst += num_inst

                pred_writer.close()
                gold_writer.close()
            else:
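                # if dev UAS (w/o punctuation) drops more than 5 points below the
                # best so far, or patience is exhausted, reload the best checkpoint
                # and decay the learning rate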
                if dev_ucorr_nopunc * 100 / dev_total_nopunc < dev_ucorrect_nopunc * 100 / dev_total_nopunc - 5 or patient >= schedule:
                    # network = torch.load(model_name)
                    network.load_state_dict(torch.load(model_name))
                    lr = lr * decay_rate
                    optim = generate_optimizer(opt, lr, network.parameters())

                    if decoding == 'greedy':
                        decode = network.decode
                    elif decoding == 'mst':
                        decode = network.decode_mst
                    else:
                        raise ValueError('Unknown decoding algorithm: %s' %
                                         decoding)

                    patient = 0
                    decay += 1
                    if decay % double_schedule_decay == 0:
                        schedule *= 2
                else:
                    patient += 1

            print(
                '----------------------------------------------------------------------------------------------------------------------------'
            )
            print(
                'best dev  W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
                % (dev_ucorrect, dev_lcorrect, dev_total, dev_ucorrect * 100 /
                   dev_total, dev_lcorrect * 100 / dev_total,
                   dev_ucomlpete_match * 100 / dev_total_inst,
                   dev_lcomplete_match * 100 / dev_total_inst, best_epoch))
            print(
                'best dev  Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
                % (dev_ucorrect_nopunc, dev_lcorrect_nopunc, dev_total_nopunc,
                   dev_ucorrect_nopunc * 100 / dev_total_nopunc,
                   dev_lcorrect_nopunc * 100 / dev_total_nopunc,
                   dev_ucomlpete_match_nopunc * 100 / dev_total_inst,
                   dev_lcomplete_match_nopunc * 100 / dev_total_inst,
                   best_epoch))
            print(
                'best dev  Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)'
                % (dev_root_correct, dev_total_root,
                   dev_root_correct * 100 / dev_total_root, best_epoch))
            print(
                '----------------------------------------------------------------------------------------------------------------------------'
            )
            print(
                'best test W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
                % (test_ucorrect, test_lcorrect, test_total, test_ucorrect *
                   100 / test_total, test_lcorrect * 100 / test_total,
                   test_ucomlpete_match * 100 / test_total_inst,
                   test_lcomplete_match * 100 / test_total_inst, best_epoch))
            print(
                'best test Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
                %
                (test_ucorrect_nopunc, test_lcorrect_nopunc, test_total_nopunc,
                 test_ucorrect_nopunc * 100 / test_total_nopunc,
                 test_lcorrect_nopunc * 100 / test_total_nopunc,
                 test_ucomlpete_match_nopunc * 100 / test_total_inst,
                 test_lcomplete_match_nopunc * 100 / test_total_inst,
                 best_epoch))
            print(
                'best test Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)'
                % (test_root_correct, test_total_root,
                   test_root_correct * 100 / test_total_root, best_epoch))
            print(
                '============================================================================================================================'
            )

            if decay == max_decay:
                break

    print('Baseline Attack.')


def run_biaffine(model_path, model_name, test_path, punct_set, use_gpu, logger,
                 args):
    alphabet_path = os.path.join(model_path, 'alphabets/')
    model_name = os.path.join(model_path, model_name)
    word_alphabet, char_alphabet, pos_alphabet, \
    type_alphabet = conllx_data.create_alphabets(
        alphabet_path,
        None,
        data_paths=[None, None],
        max_vocabulary_size=50000,
        embedd_dict=None
    )

    num_words = word_alphabet.size()
    num_chars = char_alphabet.size()
    num_pos = pos_alphabet.size()
    num_types = type_alphabet.size()

    logger.info("Word Alphabet Size: %d" % num_words)
    logger.info("Character Alphabet Size: %d" % num_chars)
    logger.info("POS Alphabet Size: %d" % num_pos)
    logger.info("Type Alphabet Size: %d" % num_types)

    decoding = args.decode

    logger.info('use gpu: %s, decoding: %s' % (use_gpu, decoding))

    device = torch.device('cuda') if use_gpu else torch.device('cpu')

    data_test = aida_data.read_data_to_tensor(test_path,
                                              word_alphabet,
                                              char_alphabet,
                                              pos_alphabet,
                                              type_alphabet,
                                              symbolic_root=True,
                                              device=device)

    pred_writer = AIDAWriter(word_alphabet, char_alphabet, pos_alphabet,
                             type_alphabet)

    logger.info('model: %s' % model_name)

    def load_model_arguments_from_json():
        arguments = json.load(open(arg_path, 'r'))
        return arguments['args'], arguments['kwargs']

    arg_path = model_name + '.arg.json'
    model_args, model_kwargs = load_model_arguments_from_json()
    network = BiRecurrentConvBiAffine(*model_args, **model_kwargs)
    if torch.cuda.is_available():
        map_location = lambda storage, loc: storage.cuda()
    else:
        map_location = 'cpu'
    network.load_state_dict(torch.load(model_name, map_location=map_location))

    if use_gpu:
        network.cuda()
    else:
        network.cpu()

    network.eval()

    if decoding == 'greedy':
        decode = network.decode
    elif decoding == 'mst':
        decode = network.decode_mst
    else:
        raise ValueError('Unknown decoding algorithm: %s' % decoding)

    pred_writer.start(args.output_path)
    sent = 0
    start_time = time.time()

    with torch.no_grad():
        for batch in aida_data.iterate_batch_tensor(data_test, 1):
            sys.stdout.write('Processing sentence: %d\n' % sent)
            sys.stdout.flush()
            sent += 1

            word, char, pos, _, _, masks, lengths, segment_ids_words = batch
            heads_pred, types_pred = decode(
                word,
                char,
                pos,
                mask=masks,
                length=lengths,
                leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
            word = word.data.cpu().numpy()
            pos = pos.data.cpu().numpy()
            lengths = lengths.cpu().numpy()
            segment_ids, segment_words = zip(*segment_ids_words)
            pred_writer.write(segment_ids,
                              segment_words,
                              word,
                              pos,
                              heads_pred,
                              types_pred,
                              lengths,
                              symbolic_root=True)

        pred_writer.close()

    print('\ntime: %.2fs' % (time.time() - start_time))


def main():
    args_parser = argparse.ArgumentParser(
        description='Tuning with graph-based parsing')
    args_parser.register('type', 'bool', str2bool)

    args_parser.add_argument('--seed',
                             type=int,
                             default=1234,
                             help='random seed for reproducibility')
    args_parser.add_argument('--mode',
                             choices=['RNN', 'LSTM', 'GRU', 'FastLSTM'],
                             help='architecture of rnn',
                             required=True)
    args_parser.add_argument('--num_epochs',
                             type=int,
                             default=1000,
                             help='Number of training epochs')
    args_parser.add_argument('--batch_size',
                             type=int,
                             default=64,
                             help='Number of sentences in each batch')
    args_parser.add_argument('--hidden_size',
                             type=int,
                             default=256,
                             help='Number of hidden units in RNN')
    args_parser.add_argument('--arc_space',
                             type=int,
                             default=128,
                             help='Dimension of tag space')
    args_parser.add_argument('--type_space',
                             type=int,
                             default=128,
                             help='Dimension of tag space')
    args_parser.add_argument('--num_layers',
                             type=int,
                             default=1,
                             help='Number of layers of encoder.')
    args_parser.add_argument('--num_filters',
                             type=int,
                             default=50,
                             help='Number of filters in CNN')
    args_parser.add_argument('--pos',
                             action='store_true',
                             help='use part-of-speech embedding.')
    args_parser.add_argument('--char',
                             action='store_true',
                             help='use character embedding and CNN.')
    args_parser.add_argument('--pos_dim',
                             type=int,
                             default=50,
                             help='Dimension of POS embeddings')
    args_parser.add_argument('--char_dim',
                             type=int,
                             default=50,
                             help='Dimension of Character embeddings')
    args_parser.add_argument('--opt',
                             choices=['adam', 'sgd', 'adamax'],
                             help='optimization algorithm')
    args_parser.add_argument('--objective',
                             choices=['cross_entropy', 'crf'],
                             default='cross_entropy',
                             help='objective function of training procedure.')
    args_parser.add_argument('--decode',
                             choices=['mst', 'greedy'],
                             default='mst',
                             help='decoding algorithm')
    args_parser.add_argument('--learning_rate',
                             type=float,
                             default=0.01,
                             help='Learning rate')
    # args_parser.add_argument('--decay_rate', type=float, default=0.05, help='Decay rate of learning rate')
    args_parser.add_argument('--clip',
                             type=float,
                             default=5.0,
                             help='gradient clipping')
    args_parser.add_argument('--gamma',
                             type=float,
                             default=0.0,
                             help='weight for regularization')
    args_parser.add_argument('--epsilon',
                             type=float,
                             default=1e-8,
                             help='epsilon for adam or adamax')
    args_parser.add_argument('--p_rnn',
                             nargs='+',
                             type=float,
                             required=True,
                             help='dropout rate for RNN')
    args_parser.add_argument('--p_in',
                             type=float,
                             default=0.33,
                             help='dropout rate for input embeddings')
    args_parser.add_argument('--p_out',
                             type=float,
                             default=0.33,
                             help='dropout rate for output layer')
    # args_parser.add_argument('--schedule', type=int, help='schedule for learning rate decay')
    args_parser.add_argument(
        '--unk_replace',
        type=float,
        default=0.,
        help='The rate to replace a singleton word with UNK')
    args_parser.add_argument('--punctuation',
                             nargs='+',
                             type=str,
                             help='List of punctuations')
    args_parser.add_argument(
        '--word_embedding',
        choices=['word2vec', 'glove', 'senna', 'sskip', 'polyglot'],
        help='Embedding for words',
        required=True)
    args_parser.add_argument('--word_path',
                             help='path for word embedding dict')
    args_parser.add_argument(
        '--freeze',
        action='store_true',
        help='freeze the word embeddings (disable fine-tuning).')
    args_parser.add_argument('--char_embedding',
                             choices=['random', 'polyglot'],
                             help='Embedding for characters',
                             required=True)
    args_parser.add_argument('--char_path',
                             help='path for character embedding dict')
    args_parser.add_argument('--data_dir', help='Data directory path')
    args_parser.add_argument(
        '--src_lang',
        required=True,
        help='Src language to train dependency parsing model')
    args_parser.add_argument('--aux_lang',
                             nargs='+',
                             help='Language names for adversarial training')
    args_parser.add_argument('--vocab_path',
                             help='path for prebuilt alphabets.',
                             default=None)
    args_parser.add_argument('--model_path',
                             help='path for saving model file.',
                             required=True)
    args_parser.add_argument('--model_name',
                             help='name for saving model file.',
                             required=True)
    #
    args_parser.add_argument('--attn_on_rnn',
                             action='store_true',
                             help='use self-attention on top of context RNN.')
    args_parser.add_argument('--no_word',
                             type='bool',
                             default=False,
                             help='do not use word embedding.')
    args_parser.add_argument('--use_bert',
                             type='bool',
                             default=False,
                             help='use multilingual BERT.')
    #
    # lrate schedule with warmup in the first iter.
    args_parser.add_argument('--use_warmup_schedule',
                             type='bool',
                             default=False,
                             help="Use warmup lrate schedule.")
    args_parser.add_argument('--decay_rate',
                             type=float,
                             default=0.75,
                             help='Decay rate of learning rate')
    args_parser.add_argument('--max_decay',
                             type=int,
                             default=9,
                             help='Number of decays before stop')
    args_parser.add_argument('--schedule',
                             type=int,
                             help='schedule for learning rate decay')
    args_parser.add_argument('--double_schedule_decay',
                             type=int,
                             default=5,
                             help='Number of decays to double schedule')
    args_parser.add_argument(
        '--check_dev',
        type=int,
        default=5,
        help='Check development performance in every n\'th iteration')
    # encoder selection
    args_parser.add_argument('--encoder_type',
                             choices=['Transformer', 'RNN', 'SelfAttn'],
                             default='RNN',
                             help='type of sentence encoder.')
    args_parser.add_argument(
        '--pool_type',
        default='mean',
        choices=['max', 'mean', 'weight'],
        help='pool type to form fixed length vector from word embeddings')
    # Transformer encoder
    args_parser.add_argument(
        '--trans_hid_size',
        type=int,
        default=1024,
        help='#hidden units in point-wise feed-forward in transformer')
    args_parser.add_argument(
        '--d_k',
        type=int,
        default=64,
        help='d_k for multi-head-attention in transformer encoder')
    args_parser.add_argument(
        '--d_v',
        type=int,
        default=64,
        help='d_v for multi-head-attention in transformer encoder')
    args_parser.add_argument('--num_head',
                             type=int,
                             default=8,
                             help='Value of h in multi-head attention')
    args_parser.add_argument(
        '--use_all_encoder_layers',
        type='bool',
        default=False,
        help='Use a weighted representations of all encoder layers')
    # - positional
    args_parser.add_argument(
        '--enc_use_neg_dist',
        action='store_true',
        help="Use negative distance for enc's relational-distance embedding.")
    args_parser.add_argument(
        '--enc_clip_dist',
        type=int,
        default=0,
        help="The clipping distance for relative position features.")
    args_parser.add_argument('--position_dim',
                             type=int,
                             default=50,
                             help='Dimension of Position embeddings.')
    args_parser.add_argument(
        '--position_embed_num',
        type=int,
        default=200,
        help=
        'Minimum number of position embeddings (usually the maximum sentence length).'
    )
    args_parser.add_argument('--train_position',
                             action='store_true',
                             help='train positional encoding for transformer.')

    args_parser.add_argument('--input_concat_embeds',
                             action='store_true',
                             help="Concat input embeddings, otherwise add.")
    args_parser.add_argument('--input_concat_position',
                             action='store_true',
                             help="Concat position embeddings, otherwise add.")
    args_parser.add_argument(
        '--partitioned',
        type='bool',
        default=False,
        help=
        "Partition the content and positional attention for multi-head attention."
    )
    args_parser.add_argument(
        '--partition_type',
        choices=['content-position', 'lexical-delexical'],
        default='content-position',
        help="How to apply partition in the self-attention.")
    #
    args_parser.add_argument(
        '--train_len_thresh',
        type=int,
        default=100,
        help='In training, discard sentences longer than this.')

    #
    # regarding adversarial training
    args_parser.add_argument('--pre_model_path',
                             type=str,
                             default=None,
                             help='Path of the pretrained model.')
    args_parser.add_argument('--pre_model_name',
                             type=str,
                             default=None,
                             help='Name of the pretrained model.')
    args_parser.add_argument('--adv_training',
                             type='bool',
                             default=False,
                             help='Use adversarial training.')
    args_parser.add_argument(
        '--lambdaG',
        type=float,
        default=0.001,
        help='Scaling parameter to control generator loss.')
    args_parser.add_argument('--discriminator',
                             choices=['weak', 'not-so-weak', 'strong'],
                             default='weak',
                             help='architecture of the discriminator')
    args_parser.add_argument(
        '--delay',
        type=int,
        default=0,
        help='Number of epochs to be run first for the source task')
    args_parser.add_argument(
        '--n_critic',
        type=int,
        default=5,
        help='Number of training steps for discriminator per iter')
    args_parser.add_argument(
        '--clip_disc',
        type=float,
        default=5.0,
        help='Lower and upper clip value for disc. weights')
    args_parser.add_argument('--debug',
                             type='bool',
                             default=False,
                             help='Use debug portion of the training data')
    args_parser.add_argument('--train_level',
                             type=str,
                             default='word',
                             choices=['word', 'sent'],
                             help='Use X-level adversarial training')
    args_parser.add_argument('--train_type',
                             type=str,
                             default='GAN',
                             choices=['GR', 'GAN', 'WGAN'],
                             help='Type of adversarial training')
    #
    # regarding motivational training
    args_parser.add_argument(
        '--motivate',
        type='bool',
        default=False,
        help='This is opposite of the adversarial training')

    #
    args = args_parser.parse_args()

    # fix data-prepare seed
    random.seed(1234)
    np.random.seed(1234)
    # model's seed
    torch.manual_seed(args.seed)

    # if output directory doesn't exist, create it
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    logger = get_logger("GraphParser")

    logger.info('\ncommand-line params : {0}\n'.format(sys.argv[1:]))
    logger.info('{0}\n'.format(args))

    logger.info("Visible GPUs: %s",
                str(os.environ.get("CUDA_VISIBLE_DEVICES", "")))
    args.parallel = False
    if torch.cuda.device_count() > 1:
        args.parallel = True

    mode = args.mode
    obj = args.objective
    decoding = args.decode

    train_path = args.data_dir + args.src_lang + "_train.debug.1_10.conllu" \
        if args.debug else args.data_dir + args.src_lang + '_train.conllu'
    dev_path = args.data_dir + args.src_lang + "_dev.conllu"
    test_path = args.data_dir + args.src_lang + "_test.conllu"

    #
    vocab_path = args.vocab_path if args.vocab_path is not None else args.model_path
    model_path = args.model_path
    model_name = args.model_name

    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    arc_space = args.arc_space
    type_space = args.type_space
    num_layers = args.num_layers
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    opt = args.opt
    momentum = 0.9
    betas = (0.9, 0.9)
    eps = args.epsilon
    decay_rate = args.decay_rate
    clip = args.clip
    gamma = args.gamma
    schedule = args.schedule
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    unk_replace = args.unk_replace
    punctuation = args.punctuation

    freeze = args.freeze
    use_word_emb = not args.no_word
    word_embedding = args.word_embedding
    word_path = args.word_path

    use_char = args.char
    char_embedding = args.char_embedding
    char_path = args.char_path

    attn_on_rnn = args.attn_on_rnn
    encoder_type = args.encoder_type
    if attn_on_rnn:
        assert encoder_type == 'RNN'

    t_types = (args.adv_training, args.motivate)
    t_count = sum(1 for tt in t_types if tt)
    if t_count > 1:
        assert False, "Only one of: adv_training or motivate can be true"

    # ------------------- Loading/initializing embeddings -------------------- #

    use_pos = args.pos
    pos_dim = args.pos_dim
    word_dict, word_dim = utils.load_embedding_dict(word_embedding, word_path)
    char_dict = None
    char_dim = args.char_dim
    if char_embedding != 'random':
        char_dict, char_dim = utils.load_embedding_dict(
            char_embedding, char_path)

    logger.info("Creating Alphabets")
    alphabet_path = os.path.join(vocab_path, 'alphabets/')
    model_name = os.path.join(model_path, model_name)

    # TODO (WARNING): the vocabularies must have been built beforehand
    assert os.path.isdir(alphabet_path), "vocabularies must be built beforehand"
    word_alphabet, char_alphabet, pos_alphabet, type_alphabet, max_sent_length = conllx_data.create_alphabets(
        alphabet_path,
        train_path,
        data_paths=[dev_path, test_path],
        max_vocabulary_size=50000,
        embedd_dict=word_dict)
    max_sent_length = max(max_sent_length, args.position_embed_num)

    num_words = word_alphabet.size()
    num_chars = char_alphabet.size()
    num_pos = pos_alphabet.size()
    num_types = type_alphabet.size()

    logger.info("Word Alphabet Size: %d" % num_words)
    logger.info("Character Alphabet Size: %d" % num_chars)
    logger.info("POS Alphabet Size: %d" % num_pos)
    logger.info("Type Alphabet Size: %d" % num_types)

    # ------------------------------------------------------------------------- #
    # --------------------- Loading/building the model ------------------------ #

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / word_dim)
        table = np.empty([word_alphabet.size(), word_dim], dtype=np.float32)
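        # UNK (and any OOV word below) gets a zero vector when embeddings are
        # frozen, otherwise a small uniform-random vector in [-scale, scale]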
        table[conllx_data.UNK_ID, :] = np.zeros([1, word_dim]).astype(
            np.float32) if freeze else np.random.uniform(
                -scale, scale, [1, word_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in word_dict:
                embedding = word_dict[word]
            elif word.lower() in word_dict:
                embedding = word_dict[word.lower()]
            else:
                embedding = np.zeros([1, word_dim]).astype(
                    np.float32) if freeze else np.random.uniform(
                        -scale, scale, [1, word_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('word OOV: %d' % oov)
        return torch.from_numpy(table)

    def construct_char_embedding_table():
        if char_dict is None:
            return None

        scale = np.sqrt(3.0 / char_dim)
        table = np.empty([num_chars, char_dim], dtype=np.float32)
        table[conllx_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, char_dim]).astype(np.float32)
        oov = 0
        for char, index in char_alphabet.items():
            if char in char_dict:
                embedding = char_dict[char]
            else:
                embedding = np.random.uniform(-scale, scale,
                                              [1, char_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('character OOV: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table() if use_word_emb else None
    char_table = construct_char_embedding_table() if use_char else None

    def load_model_arguments_from_json():
        arguments = json.load(open(pre_model_path, 'r'))
        return arguments['args'], arguments['kwargs']

    window = 3
    if obj == 'cross_entropy':
        if args.pre_model_path and args.pre_model_name:
            pre_model_name = os.path.join(args.pre_model_path,
                                          args.pre_model_name)
            pre_model_path = pre_model_name + '.arg.json'
            model_args, kwargs = load_model_arguments_from_json()

            network = BiRecurrentConvBiAffine(use_gpu=use_gpu,
                                              *model_args,
                                              **kwargs)
            network.load_state_dict(torch.load(pre_model_name))
            logger.info("Model reloaded from %s" % pre_model_path)

            # Adjust the word embedding layer
            if network.embedder.word_embedd is not None:
                network.embedder.word_embedd = nn.Embedding(num_words,
                                                            word_dim,
                                                            _weight=word_table)

        else:
            network = BiRecurrentConvBiAffine(
                word_dim,
                num_words,
                char_dim,
                num_chars,
                pos_dim,
                num_pos,
                num_filters,
                window,
                mode,
                hidden_size,
                num_layers,
                num_types,
                arc_space,
                type_space,
                embedd_word=word_table,
                embedd_char=char_table,
                p_in=p_in,
                p_out=p_out,
                p_rnn=p_rnn,
                biaffine=True,
                pos=use_pos,
                char=use_char,
                train_position=args.train_position,
                encoder_type=encoder_type,
                trans_hid_size=args.trans_hid_size,
                d_k=args.d_k,
                d_v=args.d_v,
                num_head=args.num_head,
                enc_use_neg_dist=args.enc_use_neg_dist,
                enc_clip_dist=args.enc_clip_dist,
                position_dim=args.position_dim,
                max_sent_length=max_sent_length,
                use_gpu=use_gpu,
                use_word_emb=use_word_emb,
                input_concat_embeds=args.input_concat_embeds,
                input_concat_position=args.input_concat_position,
                attn_on_rnn=attn_on_rnn,
                partitioned=args.partitioned,
                partition_type=args.partition_type,
                use_all_encoder_layers=args.use_all_encoder_layers,
                use_bert=args.use_bert)

    elif obj == 'crf':
        raise NotImplementedError
    else:
        raise RuntimeError('Unknown objective: %s' % obj)

    # ------------------------------------------------------------------------- #
    # --------------------- Loading data -------------------------------------- #

    train_data = dict()
    dev_data = dict()
    test_data = dict()
    num_data = dict()
    lang_ids = dict()
    reverse_lang_ids = dict()

    # ===== the reading =============================================
    def _read_one(path, is_train):
        lang_id = guess_language_id(path)
        logger.info("Reading: guess that the language of file %s is %s." %
                    (path, lang_id))
        one_data = conllx_data.read_data_to_variable(
            path,
            word_alphabet,
            char_alphabet,
            pos_alphabet,
            type_alphabet,
            use_gpu=False,
            volatile=(not is_train),
            symbolic_root=True,
            lang_id=lang_id,
            use_bert=args.use_bert,
            len_thresh=(args.train_len_thresh if is_train else 100000))
        return one_data

    data_train = _read_one(train_path, True)
    train_data[args.src_lang] = data_train
    num_data[args.src_lang] = sum(data_train[1])
    lang_ids[args.src_lang] = len(lang_ids)
    reverse_lang_ids[lang_ids[args.src_lang]] = args.src_lang

    data_dev = _read_one(dev_path, False)
    data_test = _read_one(test_path, False)
    dev_data[args.src_lang] = data_dev
    test_data[args.src_lang] = data_test

    # ===============================================================

    # ===== reading data for adversarial training ===================
    if t_count > 0:
        for language in args.aux_lang:
            aux_train_path = args.data_dir + language + "_train.debug.1_10.conllu" \
                if args.debug else args.data_dir + language + '_train.conllu'
            aux_train_data = _read_one(aux_train_path, True)
            num_data[language] = sum(aux_train_data[1])
            train_data[language] = aux_train_data
            lang_ids[language] = len(lang_ids)
            reverse_lang_ids[lang_ids[language]] = language
    # ===============================================================

    punct_set = None
    if punctuation is not None:
        punct_set = set(punctuation)
        logger.info("punctuations(%d): %s" %
                    (len(punct_set), ' '.join(punct_set)))

    def save_args():
        arg_path = model_name + '.arg.json'
        arguments = [
            word_dim, num_words, char_dim, num_chars, pos_dim, num_pos,
            num_filters, window, mode, hidden_size, num_layers, num_types,
            arc_space, type_space
        ]
        kwargs = {
            'p_in': p_in,
            'p_out': p_out,
            'p_rnn': p_rnn,
            'biaffine': True,
            'pos': use_pos,
            'char': use_char,
            'train_position': args.train_position,
            'encoder_type': args.encoder_type,
            'trans_hid_size': args.trans_hid_size,
            'd_k': args.d_k,
            'd_v': args.d_v,
            'num_head': args.num_head,
            'enc_use_neg_dist': args.enc_use_neg_dist,
            'enc_clip_dist': args.enc_clip_dist,
            'position_dim': args.position_dim,
            'max_sent_length': max_sent_length,
            'use_word_emb': use_word_emb,
            'input_concat_embeds': args.input_concat_embeds,
            'input_concat_position': args.input_concat_position,
            'attn_on_rnn': attn_on_rnn,
            'partitioned': args.partitioned,
            'partition_type': args.partition_type,
            'use_all_encoder_layers': args.use_all_encoder_layers,
            'use_bert': args.use_bert
        }
        json.dump({
            'args': arguments,
            'kwargs': kwargs
        },
                  open(arg_path, 'w'),
                  indent=4)

    if use_word_emb and freeze:
        freeze_embedding(network.embedder.word_embedd)

    if args.parallel:
        network = torch.nn.DataParallel(network)

    if use_gpu:
        network = network.cuda()

    save_args()

    param_dict = {}
    encoder = network.module.encoder if args.parallel else network.encoder
    for name, param in encoder.named_parameters():
        if param.requires_grad:
            param_dict[name] = np.prod(param.size())

    total_params = np.sum(list(param_dict.values()))
    logger.info('Total Encoder Parameters = %d' % total_params)

    # ------------------------------------------------------------------------- #

    # =============================================
    if args.adv_training:
        disc_feat_size = network.module.encoder.output_dim if args.parallel else network.encoder.output_dim
        reverse_grad = args.train_type == 'GR'
        nclass = len(lang_ids) if args.train_type == 'GR' else 1
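        # gradient-reversal (GR) training uses a language classifier over all
        # training languages; GAN/WGAN training uses a single-output real/fake critic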

        kwargs = {
            'input_size': disc_feat_size,
            'disc_type': args.discriminator,
            'train_level': args.train_level,
            'train_type': args.train_type,
            'reverse_grad': reverse_grad,
            'soft_label': True,
            'nclass': nclass,
            'scale': args.lambdaG,
            'use_gpu': use_gpu,
            'opt': 'adam',
            'lr': 0.001,
            'betas': (0.9, 0.999),
            'gamma': 0,
            'eps': 1e-8,
            'momentum': 0,
            'clip_disc': args.clip_disc
        }
        AdvAgent = Adversarial(**kwargs)
        if use_gpu:
            AdvAgent.cuda()

    elif args.motivate:
        disc_feat_size = network.module.encoder.output_dim if args.parallel else network.encoder.output_dim
        nclass = len(lang_ids)

        kwargs = {
            'input_size': disc_feat_size,
            'disc_type': args.discriminator,
            'train_level': args.train_level,
            'nclass': nclass,
            'scale': args.lambdaG,
            'use_gpu': use_gpu,
            'opt': 'adam',
            'lr': 0.001,
            'betas': (0.9, 0.999),
            'gamma': 0,
            'eps': 1e-8,
            'momentum': 0,
            'clip_disc': args.clip_disc
        }
        MtvAgent = Motivator(**kwargs)
        if use_gpu:
            MtvAgent.cuda()

    # =============================================

    # --------------------- Initializing the optimizer ------------------------ #

    lr = learning_rate
    optim = generate_optimizer(opt, lr, network.parameters(), betas, gamma,
                               eps, momentum)
    opt_info = 'opt: %s, ' % opt
    if opt == 'adam':
        opt_info += 'betas=%s, eps=%.1e' % (betas, eps)
    elif opt == 'sgd':
        opt_info += 'momentum=%.2f' % momentum
    elif opt == 'adamax':
        opt_info += 'betas=%s, eps=%.1e' % (betas, eps)

    # =============================================

    total_data = min(num_data.values())

    word_status = 'frozen' if freeze else 'fine tune'
    char_status = 'enabled' if use_char else 'disabled'
    pos_status = 'enabled' if use_pos else 'disabled'
    logger.info(
        "Embedding dim: word=%d (%s), char=%d (%s), pos=%d (%s)" %
        (word_dim, word_status, char_dim, char_status, pos_dim, pos_status))
    logger.info("CNN: filter=%d, kernel=%d" % (num_filters, window))
    logger.info(
        "RNN: %s, num_layer=%d, hidden=%d, arc_space=%d, type_space=%d" %
        (mode, num_layers, hidden_size, arc_space, type_space))
    logger.info(
        "train: obj: %s, l2: %f, (#data: %d, batch: %d, clip: %.2f, unk replace: %.2f)"
        % (obj, gamma, total_data, batch_size, clip, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" %
                (p_in, p_out, p_rnn))
    logger.info("decoding algorithm: %s" % decoding)
    logger.info(opt_info)

    # ------------------------------------------------------------------------- #
    # --------------------- Form the mini-batches ----------------------------- #
    num_batches = total_data // batch_size + 1
    aux_lang = []
    if t_count > 0:
        for language in args.aux_lang:
            aux_lang.extend([language] * num_data[language])

        assert num_data[args.src_lang] <= len(aux_lang)
    # ------------------------------------------------------------------------- #

    dev_ucorrect = 0.0
    dev_lcorrect = 0.0
    dev_ucomlpete_match = 0.0
    dev_lcomplete_match = 0.0

    dev_ucorrect_nopunc = 0.0
    dev_lcorrect_nopunc = 0.0
    dev_ucomlpete_match_nopunc = 0.0
    dev_lcomplete_match_nopunc = 0.0
    dev_root_correct = 0.0

    best_epoch = 0

    if decoding == 'greedy':
        decode = network.module.decode if args.parallel else network.decode
    elif decoding == 'mst':
        decode = network.module.decode_mst if args.parallel else network.decode_mst
    else:
        raise ValueError('Unknown decoding algorithm: %s' % decoding)

    patient = 0
    decay = 0
    max_decay = args.max_decay
    double_schedule_decay = args.double_schedule_decay

    # lrate schedule
    step_num = 0
    use_warmup_schedule = args.use_warmup_schedule

    if use_warmup_schedule:
        logger.info(
            "Using warmup learning-rate schedule for the first epoch, from 0 up to %s."
            % (lr, ))

    skip_adv_tuning = 0
    loss_fn = network.module.loss if args.parallel else network.loss
    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s, optim: %s, learning rate=%.6f, eps=%.1e, decay rate=%.2f (schedule=%d, patient=%d, decay=%d)): '
            %
            (epoch, mode, opt, lr, eps, decay_rate, schedule, patient, decay))
        train_err = 0.
        train_err_arc = 0.
        train_err_type = 0.
        train_total = 0.
        start_time = time.time()
        num_back = 0

        skip_adv_tuning += 1
        loss_d_real, loss_d_fake = [], []
        acc_d_real, acc_d_fake = [], []
        gen_loss, parsing_loss = [], []
        disent_loss = []

        # after `delay` epochs the adversarial/motivational phase begins: halve
        # the batch size (each step then also draws an auxiliary-language batch)
        # and recompute the number of batches
        if t_count > 0 and skip_adv_tuning > args.delay:
            batch_size = args.batch_size // 2
            num_batches = total_data // batch_size + 1

        # ---------------------- Sample the mini-batches -------------------------- #
        if t_count > 0:
            sampled_aux_lang = random.sample(aux_lang, num_batches)
            lang_in_batch = [(args.src_lang, sampled_aux_lang[k])
                             for k in range(num_batches)]
        else:
            lang_in_batch = [(args.src_lang, None) for _ in range(num_batches)]
        assert len(lang_in_batch) == num_batches
        # ------------------------------------------------------------------------- #

        network.train()
        warmup_factor = (lr + 0.) / num_batches
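        # linear warmup: during the first epoch cur_lrate grows by warmup_factor
        # at every step, reaching the configured lr after num_batches steps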
        for batch in range(1, num_batches + 1):
            update_generator = True
            update_discriminator = False

            # lrate schedule (before each step)
            step_num += 1
            if use_warmup_schedule and epoch <= 1:
                cur_lrate = warmup_factor * step_num
                # set lr
                for param_group in optim.param_groups:
                    param_group['lr'] = cur_lrate

            # considering source language as real and auxiliary languages as fake
            real_lang, fake_lang = lang_in_batch[batch - 1]
            real_idx, fake_idx = lang_ids.get(real_lang), lang_ids.get(
                fake_lang, -1)

            #
            word, char, pos, heads, types, masks, lengths, bert_inputs = conllx_data.get_batch_variable(
                train_data[real_lang], batch_size, unk_replace=unk_replace)

            if use_gpu:
                word = word.cuda()
                char = char.cuda()
                pos = pos.cuda()
                heads = heads.cuda()
                types = types.cuda()
                masks = masks.cuda()
                lengths = lengths.cuda()
                if bert_inputs[0] is not None:
                    bert_inputs[0] = bert_inputs[0].cuda()
                    bert_inputs[1] = bert_inputs[1].cuda()
                    bert_inputs[2] = bert_inputs[2].cuda()

            real_enc = network(word,
                               char,
                               pos,
                               input_bert=bert_inputs,
                               mask=masks,
                               length=lengths,
                               hx=None)
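            # real_enc['output'] holds the encoder representations of the source-
            # language batch; they feed both the parsing loss and the discriminator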

            # ========== Update the discriminator ==========
            if t_count > 0 and skip_adv_tuning > args.delay:
                # fake examples = 0
                word_f, char_f, pos_f, heads_f, types_f, masks_f, lengths_f, bert_inputs = conllx_data.get_batch_variable(
                    train_data[fake_lang], batch_size, unk_replace=unk_replace)

                if use_gpu:
                    word_f = word_f.cuda()
                    char_f = char_f.cuda()
                    pos_f = pos_f.cuda()
                    heads_f = heads_f.cuda()
                    types_f = types_f.cuda()
                    masks_f = masks_f.cuda()
                    lengths_f = lengths_f.cuda()
                    if bert_inputs[0] is not None:
                        bert_inputs[0] = bert_inputs[0].cuda()
                        bert_inputs[1] = bert_inputs[1].cuda()
                        bert_inputs[2] = bert_inputs[2].cuda()

                fake_enc = network(word_f,
                                   char_f,
                                   pos_f,
                                   input_bert=bert_inputs,
                                   mask=masks_f,
                                   length=lengths_f,
                                   hx=None)

                # TODO: temporary hack
                if t_count > 0 and skip_adv_tuning > args.delay:
                    # if n_critic > 0, train the discriminator every batch;
                    # if n_critic < 0, train it only once every |n_critic| batches
                    if args.n_critic > 0 or (batch - 1) % (-1 *
                                                           args.n_critic) == 0:
                        update_discriminator = True

            if update_discriminator:
                if args.adv_training:
                    real_loss, fake_loss, real_acc, fake_acc = AdvAgent.update(
                        real_enc['output'].detach(),
                        fake_enc['output'].detach(), real_idx, fake_idx)

                    loss_d_real.append(real_loss)
                    loss_d_fake.append(fake_loss)
                    acc_d_real.append(real_acc)
                    acc_d_fake.append(fake_acc)

                elif args.motivate:
                    real_loss, fake_loss, real_acc, fake_acc = MtvAgent.update(
                        real_enc['output'].detach(),
                        fake_enc['output'].detach(), real_idx, fake_idx)

                    loss_d_real.append(real_loss)
                    loss_d_fake.append(fake_loss)
                    acc_d_real.append(real_acc)
                    acc_d_fake.append(fake_acc)

                else:
                    raise NotImplementedError()

                if args.n_critic > 0 and (batch - 1) % args.n_critic != 0:
                    update_generator = False

            # ==============================================

            # =========== Update the generator =============
            if update_generator:
                others_loss = None
                if args.adv_training and skip_adv_tuning > args.delay:
                    # for GAN: L_G= L_parsing - (lambda_G * L_D)
                    # for GR : L_G= L_parsing +  L_D
                    others_loss = AdvAgent.gen_loss(real_enc['output'],
                                                    fake_enc['output'],
                                                    real_idx, fake_idx)
                    gen_loss.append(others_loss.item())

                elif args.motivate and skip_adv_tuning > args.delay:
                    others_loss = MtvAgent.gen_loss(real_enc['output'],
                                                    fake_enc['output'],
                                                    real_idx, fake_idx)
                    gen_loss.append(others_loss.item())

                optim.zero_grad()

                loss_arc, loss_type = loss_fn(real_enc['output'],
                                              heads,
                                              types,
                                              mask=masks,
                                              length=lengths)
                loss = loss_arc + loss_type

                # instances counted: whole sentences for the CRF objective, otherwise
                # non-root tokens (mask total minus one symbolic root per sentence)
                num_inst = word.size(
                    0) if obj == 'crf' else masks.sum() - word.size(0)
                train_err += loss.item() * num_inst
                train_err_arc += loss_arc.item() * num_inst
                train_err_type += loss_type.item() * num_inst
                train_total += num_inst
                parsing_loss.append(loss.item())

                if others_loss is not None:
                    loss = loss + others_loss

                loss.backward()
                clip_grad_norm_(network.parameters(), clip)
                optim.step()

                time_ave = (time.time() - start_time) / batch
                time_left = (num_batches - batch) * time_ave

        if (args.adv_training
                or args.motivate) and skip_adv_tuning > args.delay:
            logger.info(
                'epoch: %d train: %d loss: %.4f, arc: %.4f, type: %.4f, dis_loss: (%.2f, %.2f), dis_acc: (%.2f, %.2f), '
                'gen_loss: %.2f, time: %.2fs' %
                (epoch, num_batches, train_err / train_total,
                 train_err_arc / train_total, train_err_type / train_total,
                 sum(loss_d_real) / len(loss_d_real), sum(loss_d_fake) /
                 len(loss_d_fake), sum(acc_d_real) / len(acc_d_real),
                 sum(acc_d_fake) / len(acc_d_fake),
                 sum(gen_loss) / len(gen_loss), time.time() - start_time))
        else:
            logger.info(
                'epoch: %d train: %d loss: %.4f, arc: %.4f, type: %.4f, time: %.2fs'
                % (epoch, num_batches, train_err / train_total,
                   train_err_arc / train_total, train_err_type / train_total,
                   time.time() - start_time))

        ################# Validation on Dependency Parsing Only #################################
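        # run validation only every `check_dev` epochs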
        if epoch % args.check_dev != 0:
            continue

        with torch.no_grad():
            # evaluate performance on dev data
            network.eval()

            dev_ucorr = 0.0
            dev_lcorr = 0.0
            dev_total = 0
            dev_ucomlpete = 0.0
            dev_lcomplete = 0.0
            dev_ucorr_nopunc = 0.0
            dev_lcorr_nopunc = 0.0
            dev_total_nopunc = 0
            dev_ucomlpete_nopunc = 0.0
            dev_lcomplete_nopunc = 0.0
            dev_root_corr = 0.0
            dev_total_root = 0.0
            dev_total_inst = 0.0

            for lang, data_dev in dev_data.items():
                for batch in conllx_data.iterate_batch_variable(
                        data_dev, batch_size):
                    word, char, pos, heads, types, masks, lengths, bert_inputs = batch

                    if use_gpu:
                        word = word.cuda()
                        char = char.cuda()
                        pos = pos.cuda()
                        heads = heads.cuda()
                        types = types.cuda()
                        masks = masks.cuda()
                        lengths = lengths.cuda()
                        if bert_inputs[0] is not None:
                            bert_inputs[0] = bert_inputs[0].cuda()
                            bert_inputs[1] = bert_inputs[1].cuda()
                            bert_inputs[2] = bert_inputs[2].cuda()

                    heads_pred, types_pred = decode(
                        word,
                        char,
                        pos,
                        input_bert=bert_inputs,
                        mask=masks,
                        length=lengths,
                        leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
                    word = word.cpu().numpy()
                    pos = pos.cpu().numpy()
                    lengths = lengths.cpu().numpy()
                    heads = heads.cpu().numpy()
                    types = types.cpu().numpy()

                    stats, stats_nopunc, stats_root, num_inst = parser.eval(
                        word,
                        pos,
                        heads_pred,
                        types_pred,
                        heads,
                        types,
                        word_alphabet,
                        pos_alphabet,
                        lengths,
                        punct_set=punct_set,
                        symbolic_root=True)
                    ucorr, lcorr, total, ucm, lcm = stats
                    ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
                    corr_root, total_root = stats_root

                    dev_ucorr += ucorr
                    dev_lcorr += lcorr
                    dev_total += total
                    dev_ucomlpete += ucm
                    dev_lcomplete += lcm

                    dev_ucorr_nopunc += ucorr_nopunc
                    dev_lcorr_nopunc += lcorr_nopunc
                    dev_total_nopunc += total_nopunc
                    dev_ucomlpete_nopunc += ucm_nopunc
                    dev_lcomplete_nopunc += lcm_nopunc

                    dev_root_corr += corr_root
                    dev_total_root += total_root
                    dev_total_inst += num_inst

            print(
                'W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%'
                % (dev_ucorr, dev_lcorr, dev_total, dev_ucorr * 100 /
                   dev_total, dev_lcorr * 100 / dev_total, dev_ucomlpete *
                   100 / dev_total_inst, dev_lcomplete * 100 / dev_total_inst))
            print(
                'Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%'
                %
                (dev_ucorr_nopunc, dev_lcorr_nopunc, dev_total_nopunc,
                 dev_ucorr_nopunc * 100 / dev_total_nopunc, dev_lcorr_nopunc *
                 100 / dev_total_nopunc, dev_ucomlpete_nopunc * 100 /
                 dev_total_inst, dev_lcomplete_nopunc * 100 / dev_total_inst))
            print('Root: corr: %d, total: %d, acc: %.2f%%' %
                  (dev_root_corr, dev_total_root,
                   dev_root_corr * 100 / dev_total_root))

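            # keep the best checkpoint by labeled accuracy (ties broken by unlabeled accuracy) without punctuation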
            if dev_lcorrect_nopunc < dev_lcorr_nopunc or (
                    dev_lcorrect_nopunc == dev_lcorr_nopunc
                    and dev_ucorrect_nopunc < dev_ucorr_nopunc):
                dev_ucorrect_nopunc = dev_ucorr_nopunc
                dev_lcorrect_nopunc = dev_lcorr_nopunc
                dev_ucomlpete_match_nopunc = dev_ucomlpete_nopunc
                dev_lcomplete_match_nopunc = dev_lcomplete_nopunc

                dev_ucorrect = dev_ucorr
                dev_lcorrect = dev_lcorr
                dev_ucomlpete_match = dev_ucomlpete
                dev_lcomplete_match = dev_lcomplete

                dev_root_correct = dev_root_corr

                best_epoch = epoch
                patient = 0

                state_dict = network.module.state_dict(
                ) if args.parallel else network.state_dict()
                torch.save(state_dict, model_name)

            else:
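                # no improvement: if dev UAS dropped sharply or patience ran out, reload the best checkpoint and decay the learning rate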
                if (dev_ucorr_nopunc * 100 / dev_total_nopunc <
                        dev_ucorrect_nopunc * 100 / dev_total_nopunc - 5
                        or patient >= schedule):
                    state_dict = torch.load(model_name)
                    if args.parallel:
                        network.module.load_state_dict(state_dict)
                    else:
                        network.load_state_dict(state_dict)

                    lr = lr * decay_rate
                    optim = generate_optimizer(opt, lr, network.parameters(),
                                               betas, gamma, eps, momentum)

                    if decoding == 'greedy':
                        decode = network.module.decode if args.parallel else network.decode
                    elif decoding == 'mst':
                        decode = network.module.decode_mst if args.parallel else network.decode_mst
                    else:
                        raise ValueError('Unknown decoding algorithm: %s' %
                                         decoding)

                    patient = 0
                    decay += 1
                    if decay % double_schedule_decay == 0:
                        schedule *= 2
                else:
                    patient += 1

            print(
                '----------------------------------------------------------------------------------------------------------------------------'
            )
            print(
                'best dev  W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
                % (dev_ucorrect, dev_lcorrect, dev_total, dev_ucorrect * 100 /
                   dev_total, dev_lcorrect * 100 / dev_total,
                   dev_ucomlpete_match * 100 / dev_total_inst,
                   dev_lcomplete_match * 100 / dev_total_inst, best_epoch))
            print(
                'best dev  Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
                % (dev_ucorrect_nopunc, dev_lcorrect_nopunc, dev_total_nopunc,
                   dev_ucorrect_nopunc * 100 / dev_total_nopunc,
                   dev_lcorrect_nopunc * 100 / dev_total_nopunc,
                   dev_ucomlpete_match_nopunc * 100 / dev_total_inst,
                   dev_lcomplete_match_nopunc * 100 / dev_total_inst,
                   best_epoch))
            print(
                'best dev  Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)'
                % (dev_root_correct, dev_total_root,
                   dev_root_correct * 100 / dev_total_root, best_epoch))
            print(
                '----------------------------------------------------------------------------------------------------------------------------'
            )
            if decay == max_decay:
                break

        torch.cuda.empty_cache()  # release cached GPU memory
Example #6
0
def biaffine(model_path, model_name, test_path, punct_set, use_gpu, logger, args):
    alphabet_path = os.path.join(model_path, 'alphabets/')
    model_name = os.path.join(model_path, model_name)
    word_alphabet, char_alphabet, pos_alphabet, \
    type_alphabet = conllx_data.create_alphabets(alphabet_path, None, data_paths=[None, None], max_vocabulary_size=50000, embedd_dict=None)

    num_words = word_alphabet.size()
    num_chars = char_alphabet.size()
    num_pos = pos_alphabet.size()
    num_types = type_alphabet.size()

    logger.info("Word Alphabet Size: %d" % num_words)
    logger.info("Character Alphabet Size: %d" % num_chars)
    logger.info("POS Alphabet Size: %d" % num_pos)
    logger.info("Type Alphabet Size: %d" % num_types)

    decoding = args.decode

    logger.info('use gpu: %s, decoding: %s' % (use_gpu, decoding))

    data_test = conllx_data.read_data_to_variable(test_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet,
                                                  use_gpu=use_gpu, volatile=True, symbolic_root=True)

    pred_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
    gold_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet, type_alphabet)

    logger.info('model: %s' % model_name)

    def load_model_arguments_from_json():
        arguments = json.load(open(arg_path, 'r'))
        return arguments['args'], arguments['kwargs']

    arg_path = model_name + '.arg.json'
    args, kwargs = load_model_arguments_from_json()
    network = BiRecurrentConvBiAffine(*args, **kwargs)
    network.load_state_dict(torch.load(model_name))

    if use_gpu:
        network.cuda()
    else:
        network.cpu()

    network.eval()

    test_ucorrect = 0.0
    test_lcorrect = 0.0
    test_ucomlpete_match = 0.0
    test_lcomplete_match = 0.0
    test_total = 0

    test_ucorrect_nopunc = 0.0
    test_lcorrect_nopunc = 0.0
    test_ucomlpete_match_nopunc = 0.0
    test_lcomplete_match_nopunc = 0.0
    test_total_nopunc = 0
    test_total_inst = 0

    test_root_correct = 0.0
    test_total_root = 0

    if decoding == 'greedy':
        decode = network.decode
    elif decoding == 'mst':
        decode = network.decode_mst
    else:
        raise ValueError('Unknown decoding algorithm: %s' % decoding)

    pred_writer.start('tmp/analyze_pred_%s' % str(uid))
    gold_writer.start('tmp/analyze_gold_%s' % str(uid))
    sent = 0
    start_time = time.time()

    for batch in conllx_data.iterate_batch_variable(data_test, 1):
        sys.stdout.write('%d, ' % sent)
        sys.stdout.flush()
        sent += 1

        word, char, pos, heads, types, masks, lengths = batch
        heads_pred, types_pred = decode(word, char, pos, mask=masks, length=lengths, leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
        word = word.data.cpu().numpy()
        pos = pos.data.cpu().numpy()
        lengths = lengths.cpu().numpy()
        heads = heads.data.cpu().numpy()
        types = types.data.cpu().numpy()

        pred_writer.write(word, pos, heads_pred, types_pred, lengths, symbolic_root=True)
        gold_writer.write(word, pos, heads, types, lengths, symbolic_root=True)

        stats, stats_nopunc, stats_root, num_inst = parser.eval(word, pos, heads_pred, types_pred, heads, types, word_alphabet, pos_alphabet, lengths, punct_set=punct_set, symbolic_root=True)
        ucorr, lcorr, total, ucm, lcm = stats
        ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
        corr_root, total_root = stats_root

        test_ucorrect += ucorr
        test_lcorrect += lcorr
        test_total += total
        test_ucomlpete_match += ucm
        test_lcomplete_match += lcm

        test_ucorrect_nopunc += ucorr_nopunc
        test_lcorrect_nopunc += lcorr_nopunc
        test_total_nopunc += total_nopunc
        test_ucomlpete_match_nopunc += ucm_nopunc
        test_lcomplete_match_nopunc += lcm_nopunc

        test_root_correct += corr_root
        test_total_root += total_root

        test_total_inst += num_inst

    pred_writer.close()
    gold_writer.close()

    print('\ntime: %.2fs' % (time.time() - start_time))
    print('test W. Punct:  ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%' % (
        test_ucorrect, test_lcorrect, test_total, test_ucorrect * 100 / test_total, test_lcorrect * 100 / test_total,
        test_ucomlpete_match * 100 / test_total_inst, test_lcomplete_match * 100 / test_total_inst))
    print('test Wo Punct:  ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%' % (
        test_ucorrect_nopunc, test_lcorrect_nopunc, test_total_nopunc,
        test_ucorrect_nopunc * 100 / test_total_nopunc, test_lcorrect_nopunc * 100 / test_total_nopunc,
        test_ucomlpete_match_nopunc * 100 / test_total_inst, test_lcomplete_match_nopunc * 100 / test_total_inst))
    print('test Root: corr: %d, total: %d, acc: %.2f%%' % (
        test_root_correct, test_total_root, test_root_correct * 100 / test_total_root))
def train_main(args):
    logger = get_logger("GraphParser")

    mode = args.model_type
    obj = args.objective
    decoding = args.decode
    train_path = args.train_data
    dev_path = args.dev_data
    test_path = args.test_data
    model_path = args.model_path
    model_name = args.model_name
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    arc_space = args.arc_space
    type_space = args.type_space
    num_layers = args.num_layers
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    opt = args.opt
    momentum = 0.9
    betas = (0.9, 0.9)
    eps = args.epsilon
    decay_rate = args.decay_rate
    clip = args.clip
    gamma = args.gamma
    schedule = args.schedule
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    unk_replace = args.unk_replace
    punctuation = args.punctuation

    freeze = args.freeze
    word_embedding = args.word_embedding
    word_path = args.word_path

    use_char = args.char
    char_embedding = args.char_embedding
    char_path = args.char_path

    use_pos = args.pos
    pos_dim = args.pos_dim
    word_dict, word_dim = utils.load_embedding_dict(word_embedding, word_path)
    char_dict = None
    char_dim = args.char_dim
    if char_embedding != 'random':
        char_dict, char_dim = utils.load_embedding_dict(
            char_embedding, char_path)

    logger.info("Creating Alphabets")
    alphabet_path = os.path.join(model_path, 'alphabets/')
    model_name = os.path.join(model_path, model_name)

    (word_alphabet, char_alphabet, pos_alphabet,
     type_alphabet) = conllx_data.create_alphabets(
         alphabet_path,
         train_path,
         data_paths=[dev_path, test_path],
         max_vocabulary_size=100000,
         embedd_dict=word_dict)

    num_words = word_alphabet.size()
    num_chars = char_alphabet.size()
    num_pos = pos_alphabet.size()
    num_types = type_alphabet.size()

    logger.info("Word Alphabet Size: %d" % num_words)
    logger.info("Character Alphabet Size: %d" % num_chars)
    logger.info("POS Alphabet Size: %d" % num_pos)
    logger.info("Type Alphabet Size: %d" % num_types)

    logger.info("Reading Data")
    device = torch.device('cuda') if args.gpu else torch.device('cpu')

    data_train = conllx_data.read_data_to_tensor(train_path,
                                                 word_alphabet,
                                                 char_alphabet,
                                                 pos_alphabet,
                                                 type_alphabet,
                                                 symbolic_root=True,
                                                 device=device)
    # data_train = conllx_data.read_data(train_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
    # num_data = sum([len(bucket) for bucket in data_train])
    num_data = sum(data_train[1])

    data_dev = conllx_data.read_data_to_tensor(dev_path,
                                               word_alphabet,
                                               char_alphabet,
                                               pos_alphabet,
                                               type_alphabet,
                                               symbolic_root=True,
                                               device=device)

    data_test = conllx_data.read_data_to_tensor(test_path,
                                                word_alphabet,
                                                char_alphabet,
                                                pos_alphabet,
                                                type_alphabet,
                                                symbolic_root=True,
                                                device=device)

    punct_set = None
    if punctuation is not None:
        punct_set = set(punctuation)
        logger.info("punctuations(%d): %s" %
                    (len(punct_set), ' '.join(punct_set)))

    def construct_word_embedding_table():
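        # build the word embedding matrix from the pretrained dictionary; OOV words get zeros (frozen) or random vectors (fine-tuned)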
        scale = np.sqrt(3.0 / word_dim)
        table = np.empty([word_alphabet.size(), word_dim], dtype=np.float32)
        if freeze:
            table[conllx_data.UNK_ID, :] = np.zeros([1, word_dim
                                                     ]).astype(np.float32)
        else:
            table[conllx_data.UNK_ID, :] = np.random.uniform(
                -scale, scale, [1, word_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in word_dict:
                embedding = word_dict[word]
            elif word.lower() in word_dict:
                embedding = word_dict[word.lower()]
            else:
                if freeze:
                    embedding = np.zeros([1, word_dim]).astype(np.float32)
                else:
                    embedding = np.random.uniform(
                        -scale, scale, [1, word_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('word OOV: %d' % oov)
        return torch.from_numpy(table)

    def construct_char_embedding_table():
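        # build the character embedding matrix; characters missing from the pretrained dictionary are initialized randomly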
        if char_dict is None:
            return None

        scale = np.sqrt(3.0 / char_dim)
        table = np.empty([num_chars, char_dim], dtype=np.float32)
        table[conllx_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, char_dim]).astype(np.float32)
        oov = 0
        for char, index, in char_alphabet.items():
            if char in char_dict:
                embedding = char_dict[char]
            else:
                embedding = np.random.uniform(-scale, scale,
                                              [1, char_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('character OOV: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    char_table = construct_char_embedding_table()

    window = 3
    if obj == 'cross_entropy':
        network = BiRecurrentConvBiAffine(word_dim,
                                          num_words,
                                          char_dim,
                                          num_chars,
                                          pos_dim,
                                          num_pos,
                                          num_filters,
                                          window,
                                          mode,
                                          hidden_size,
                                          num_layers,
                                          num_types,
                                          arc_space,
                                          type_space,
                                          embedd_word=word_table,
                                          embedd_char=char_table,
                                          p_in=p_in,
                                          p_out=p_out,
                                          p_rnn=p_rnn,
                                          biaffine=True,
                                          pos=use_pos,
                                          char=use_char)
    elif obj == 'crf':
        raise NotImplementedError
    else:
        raise RuntimeError('Unknown objective: %s' % obj)

    def save_args():
        arg_path = model_name + '.arg.json'
        arguments = [
            word_dim, num_words, char_dim, num_chars, pos_dim, num_pos,
            num_filters, window, mode, hidden_size, num_layers, num_types,
            arc_space, type_space
        ]
        kwargs = {
            'p_in': p_in,
            'p_out': p_out,
            'p_rnn': p_rnn,
            'biaffine': True,
            'pos': use_pos,
            'char': use_char
        }
        json.dump({
            'args': arguments,
            'kwargs': kwargs
        },
                  open(arg_path, 'w'),
                  indent=4)

    if freeze:
        freeze_embedding(network.word_embedd)

    network = network.to(device)

    save_args()

    # pred_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
    # gold_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet, type_alphabet)

    def generate_optimizer(opt, lr, params):
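        # build the optimizer over trainable parameters only; gamma is used as the L2 weight decay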
        params = filter(lambda param: param.requires_grad, params)
        if opt == 'adam':
            return Adam(params,
                        lr=lr,
                        betas=betas,
                        weight_decay=gamma,
                        eps=eps)
        elif opt == 'sgd':
            return SGD(params,
                       lr=lr,
                       momentum=momentum,
                       weight_decay=gamma,
                       nesterov=True)
        elif opt == 'adamax':
            return Adamax(params,
                          lr=lr,
                          betas=betas,
                          weight_decay=gamma,
                          eps=eps)
        else:
            raise ValueError('Unknown optimization algorithm: %s' % opt)

    lr = learning_rate
    optim = generate_optimizer(opt, lr, network.parameters())
    opt_info = 'opt: %s, ' % opt
    if opt == 'adam':
        opt_info += 'betas=%s, eps=%.1e' % (betas, eps)
    elif opt == 'sgd':
        opt_info += 'momentum=%.2f' % momentum
    elif opt == 'adamax':
        opt_info += 'betas=%s, eps=%.1e' % (betas, eps)

    word_status = 'frozen' if freeze else 'fine tune'
    char_status = 'enabled' if use_char else 'disabled'
    pos_status = 'enabled' if use_pos else 'disabled'
    logger.info(
        "Embedding dim: word=%d (%s), char=%d (%s), pos=%d (%s)" %
        (word_dim, word_status, char_dim, char_status, pos_dim, pos_status))
    logger.info("CNN: filter=%d, kernel=%d" % (num_filters, window))
    logger.info(
        "RNN: %s, num_layer=%d, hidden=%d, arc_space=%d, type_space=%d" %
        (mode, num_layers, hidden_size, arc_space, type_space))
    logger.info(
        "train: obj: %s, l2: %f, (#data: %d, batch: %d, clip: %.2f, unk replace: %.2f)"
        % (obj, gamma, num_data, batch_size, clip, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" %
                (p_in, p_out, p_rnn))
    logger.info("decoding algorithm: %s" % decoding)
    logger.info(opt_info)

    num_batches = num_data // batch_size + 1
    dev_ucorrect = 0.0
    dev_lcorrect = 0.0
    dev_ucomlpete_match = 0.0
    dev_lcomplete_match = 0.0

    dev_ucorrect_nopunc = 0.0
    dev_lcorrect_nopunc = 0.0
    dev_ucomlpete_match_nopunc = 0.0
    dev_lcomplete_match_nopunc = 0.0
    dev_root_correct = 0.0

    best_epoch = 0

    test_ucorrect = 0.0
    test_lcorrect = 0.0
    test_ucomlpete_match = 0.0
    test_lcomplete_match = 0.0

    test_ucorrect_nopunc = 0.0
    test_lcorrect_nopunc = 0.0
    test_ucomlpete_match_nopunc = 0.0
    test_lcomplete_match_nopunc = 0.0
    test_root_correct = 0.0
    test_total = 0
    test_total_nopunc = 0
    test_total_inst = 0
    test_total_root = 0

    if decoding == 'greedy':
        decode = network.decode
    elif decoding == 'mst':
        decode = network.decode_mst
    else:
        raise ValueError('Unknown decoding algorithm: %s' % decoding)

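    # learning-rate schedule bookkeeping: decay the LR after `schedule` epochs without improvement, stop after `max_decay` decays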
    patient = 0
    decay = 0
    max_decay = 9
    double_schedule_decay = 5

    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s, optim: %s, lr=%.6f, eps=%.1e, decay rate=%.2f (schedule=%d, patient=%d, decay=%d)): '
            %
            (epoch, mode, opt, lr, eps, decay_rate, schedule, patient, decay))
        train_err = 0.
        train_err_arc = 0.
        train_err_type = 0.
        train_total = 0.
        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            word, char, pos, heads, types, masks, lengths = conllx_data.get_batch_tensor(
                data_train, batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss_arc, loss_type = network.loss(word,
                                               char,
                                               pos,
                                               heads,
                                               types,
                                               mask=masks,
                                               length=lengths)
            loss = loss_arc + loss_type
            loss.backward()
            clip_grad_norm_(network.parameters(), clip)
            optim.step()

            with torch.no_grad():
                num_inst = word.size(
                    0) if obj == 'crf' else masks.sum() - word.size(0)
                train_err += loss * num_inst
                train_err_arc += loss_arc * num_inst
                train_err_type += loss_type * num_inst
                train_total += num_inst

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 10 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, arc: %.4f, type: %.4f, time left: %.2fs' % (
                    batch, num_batches, train_err / train_total, train_err_arc
                    / train_total, train_err_type / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print(
            'train: %d loss: %.4f, arc: %.4f, type: %.4f, time: %.2fs' %
            (num_batches, train_err / train_total, train_err_arc / train_total,
             train_err_type / train_total, time.time() - start_time))

        # evaluate performance on dev data
        with torch.no_grad():
            network.eval()
            # pred_filename = 'tmp/%spred_dev%d' % (str(uid), epoch)
            # pred_writer.start(pred_filename)
            # gold_filename = 'tmp/%sgold_dev%d' % (str(uid), epoch)
            # gold_writer.start(gold_filename)

            dev_ucorr = 0.0
            dev_lcorr = 0.0
            dev_total = 0
            dev_ucomlpete = 0.0
            dev_lcomplete = 0.0
            dev_ucorr_nopunc = 0.0
            dev_lcorr_nopunc = 0.0
            dev_total_nopunc = 0
            dev_ucomlpete_nopunc = 0.0
            dev_lcomplete_nopunc = 0.0
            dev_root_corr = 0.0
            dev_total_root = 0.0
            dev_total_inst = 0.0
            for batch in conllx_data.iterate_batch_tensor(
                    data_dev, batch_size):
                word, char, pos, heads, types, masks, lengths = batch
                heads_pred, types_pred = decode(
                    word,
                    char,
                    pos,
                    mask=masks,
                    length=lengths,
                    leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
                word = word.cpu().numpy()
                pos = pos.cpu().numpy()
                lengths = lengths.cpu().numpy()
                heads = heads.cpu().numpy()
                types = types.cpu().numpy()

                # pred_writer.write(word, pos, heads_pred, types_pred, lengths, symbolic_root=True)
                # gold_writer.write(word, pos, heads, types, lengths, symbolic_root=True)

                stats, stats_nopunc, stats_root, num_inst = parser.eval(
                    word,
                    pos,
                    heads_pred,
                    types_pred,
                    heads,
                    types,
                    word_alphabet,
                    pos_alphabet,
                    lengths,
                    punct_set=punct_set,
                    symbolic_root=True)
                ucorr, lcorr, total, ucm, lcm = stats
                ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
                corr_root, total_root = stats_root

                dev_ucorr += ucorr
                dev_lcorr += lcorr
                dev_total += total
                dev_ucomlpete += ucm
                dev_lcomplete += lcm

                dev_ucorr_nopunc += ucorr_nopunc
                dev_lcorr_nopunc += lcorr_nopunc
                dev_total_nopunc += total_nopunc
                dev_ucomlpete_nopunc += ucm_nopunc
                dev_lcomplete_nopunc += lcm_nopunc

                dev_root_corr += corr_root
                dev_total_root += total_root

                dev_total_inst += num_inst

            # pred_writer.close()
            # gold_writer.close()
            print(
                'W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%'
                % (dev_ucorr, dev_lcorr, dev_total, dev_ucorr * 100 /
                   dev_total, dev_lcorr * 100 / dev_total, dev_ucomlpete *
                   100 / dev_total_inst, dev_lcomplete * 100 / dev_total_inst))
            print(
                'Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%'
                %
                (dev_ucorr_nopunc, dev_lcorr_nopunc, dev_total_nopunc,
                 dev_ucorr_nopunc * 100 / dev_total_nopunc, dev_lcorr_nopunc *
                 100 / dev_total_nopunc, dev_ucomlpete_nopunc * 100 /
                 dev_total_inst, dev_lcomplete_nopunc * 100 / dev_total_inst))
            print('Root: corr: %d, total: %d, acc: %.2f%%' %
                  (dev_root_corr, dev_total_root,
                   dev_root_corr * 100 / dev_total_root))

            if (dev_lcorrect_nopunc < dev_lcorr_nopunc
                    or (dev_lcorrect_nopunc == dev_lcorr_nopunc
                        and dev_ucorrect_nopunc < dev_ucorr_nopunc)):
                dev_ucorrect_nopunc = dev_ucorr_nopunc
                dev_lcorrect_nopunc = dev_lcorr_nopunc
                dev_ucomlpete_match_nopunc = dev_ucomlpete_nopunc
                dev_lcomplete_match_nopunc = dev_lcomplete_nopunc

                dev_ucorrect = dev_ucorr
                dev_lcorrect = dev_lcorr
                dev_ucomlpete_match = dev_ucomlpete
                dev_lcomplete_match = dev_lcomplete

                dev_root_correct = dev_root_corr

                best_epoch = epoch
                patient = 0
                # torch.save(network, model_name)
                torch.save(network.state_dict(), model_name)

                # pred_filename = 'tmp/%spred_test%d' % (str(uid), epoch)
                # pred_writer.start(pred_filename)
                # gold_filename = 'tmp/%sgold_test%d' % (str(uid), epoch)
                # gold_writer.start(gold_filename)

                test_ucorrect = 0.0
                test_lcorrect = 0.0
                test_ucomlpete_match = 0.0
                test_lcomplete_match = 0.0
                test_total = 0

                test_ucorrect_nopunc = 0.0
                test_lcorrect_nopunc = 0.0
                test_ucomlpete_match_nopunc = 0.0
                test_lcomplete_match_nopunc = 0.0
                test_total_nopunc = 0
                test_total_inst = 0

                test_root_correct = 0.0
                test_total_root = 0
                for batch in conllx_data.iterate_batch_tensor(
                        data_test, batch_size):
                    word, char, pos, heads, types, masks, lengths = batch
                    heads_pred, types_pred = decode(
                        word,
                        char,
                        pos,
                        mask=masks,
                        length=lengths,
                        leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
                    word = word.cpu().numpy()
                    pos = pos.cpu().numpy()
                    lengths = lengths.cpu().numpy()
                    heads = heads.cpu().numpy()
                    types = types.cpu().numpy()

                    # pred_writer.write(word, pos, heads_pred, types_pred, lengths, symbolic_root=True)
                    # gold_writer.write(word, pos, heads, types, lengths, symbolic_root=True)

                    (stats, stats_nopunc, stats_root,
                     num_inst) = parser.eval(word,
                                             pos,
                                             heads_pred,
                                             types_pred,
                                             heads,
                                             types,
                                             word_alphabet,
                                             pos_alphabet,
                                             lengths,
                                             punct_set=punct_set,
                                             symbolic_root=True)
                    ucorr, lcorr, total, ucm, lcm = stats
                    ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
                    corr_root, total_root = stats_root

                    test_ucorrect += ucorr
                    test_lcorrect += lcorr
                    test_total += total
                    test_ucomlpete_match += ucm
                    test_lcomplete_match += lcm

                    test_ucorrect_nopunc += ucorr_nopunc
                    test_lcorrect_nopunc += lcorr_nopunc
                    test_total_nopunc += total_nopunc
                    test_ucomlpete_match_nopunc += ucm_nopunc
                    test_lcomplete_match_nopunc += lcm_nopunc

                    test_root_correct += corr_root
                    test_total_root += total_root

                    test_total_inst += num_inst

                # pred_writer.close()
                # gold_writer.close()
            else:
                if (dev_ucorr_nopunc * 100 / dev_total_nopunc <
                        dev_ucorrect_nopunc * 100 / dev_total_nopunc - 5
                        or patient >= schedule):
                    # network = torch.load(model_name)
                    network.load_state_dict(torch.load(model_name))
                    lr = lr * decay_rate
                    optim = generate_optimizer(opt, lr, network.parameters())

                    if decoding == 'greedy':
                        decode = network.decode
                    elif decoding == 'mst':
                        decode = network.decode_mst
                    else:
                        raise ValueError('Unknown decoding algorithm: %s' %
                                         decoding)

                    patient = 0
                    decay += 1
                    if decay % double_schedule_decay == 0:
                        schedule *= 2
                else:
                    patient += 1

            print('-' * 124)
            print(
                'best dev  W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
                % (dev_ucorrect, dev_lcorrect, dev_total, dev_ucorrect * 100 /
                   dev_total, dev_lcorrect * 100 / dev_total,
                   dev_ucomlpete_match * 100 / dev_total_inst,
                   dev_lcomplete_match * 100 / dev_total_inst, best_epoch))
            print(
                'best dev  Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
                % (dev_ucorrect_nopunc, dev_lcorrect_nopunc, dev_total_nopunc,
                   dev_ucorrect_nopunc * 100 / dev_total_nopunc,
                   dev_lcorrect_nopunc * 100 / dev_total_nopunc,
                   dev_ucomlpete_match_nopunc * 100 / dev_total_inst,
                   dev_lcomplete_match_nopunc * 100 / dev_total_inst,
                   best_epoch))
            print(
                'best dev  Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)'
                % (dev_root_correct, dev_total_root,
                   dev_root_correct * 100 / dev_total_root, best_epoch))
            print('-' * 124)
            print(
                'best test W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
                % (test_ucorrect, test_lcorrect, test_total, test_ucorrect *
                   100 / test_total, test_lcorrect * 100 / test_total,
                   test_ucomlpete_match * 100 / test_total_inst,
                   test_lcomplete_match * 100 / test_total_inst, best_epoch))
            print(
                'best test Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
                %
                (test_ucorrect_nopunc, test_lcorrect_nopunc, test_total_nopunc,
                 test_ucorrect_nopunc * 100 / test_total_nopunc,
                 test_lcorrect_nopunc * 100 / test_total_nopunc,
                 test_ucomlpete_match_nopunc * 100 / test_total_inst,
                 test_lcomplete_match_nopunc * 100 / test_total_inst,
                 best_epoch))
            print(
                'best test Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)'
                % (test_root_correct, test_total_root,
                   test_root_correct * 100 / test_total_root, best_epoch))
            print('=' * 124)

            if decay == max_decay:
                break
def biaffine(model_path, model_name, test_path, punct_set, use_gpu, logger, args):
    alphabet_path = os.path.join(model_path, 'alphabets/')
    model_name = os.path.join(model_path, model_name)
    word_alphabet, char_alphabet, pos_alphabet, type_alphabet, max_sent_length = conllx_data.create_alphabets(alphabet_path,
        None, data_paths=[None, None], max_vocabulary_size=50000, embedd_dict=None)
    # word_alphabet, char_alphabet, pos_alphabet, type_alphabet = create_alphabets(alphabet_path,
    #     None, data_paths=[None, None], max_vocabulary_size=50000, embedd_dict=None)

    num_words = word_alphabet.size()
    num_chars = char_alphabet.size()
    num_pos = pos_alphabet.size()
    num_types = type_alphabet.size()

    logger.info("Word Alphabet Size: %d" % num_words)
    logger.info("Character Alphabet Size: %d" % num_chars)
    logger.info("POS Alphabet Size: %d" % num_pos)
    logger.info("Type Alphabet Size: %d" % num_types)

    decoding = args.decode
    out_filename = args.out_filename
    constraints_method = args.constraints_method
    constraintFile = args.constraint_file
    ratioFile = args.ratio_file
    tolerance = args.tolerance
    gamma = args.gamma
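    # the two-letter language code is taken from a fixed slice of the mt_log filename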
    the_language = args.mt_log[9:11]
    mt_log = open(args.mt_log, 'a')
    summary_log = open(args.summary_log, 'a')
    logger.info('use gpu: %s, decoding: %s' % (use_gpu, decoding))

    #
    extra_embeds_arr = augment_with_extra_embedding(word_alphabet, args.extra_embed, args.extra_embed_src, test_path, logger)

    # ===== the reading
    def _read_one(path, is_train):
        lang_id = guess_language_id(path)
        logger.info("Reading: guess that the language of file %s is %s." % (path, lang_id))
        one_data = conllx_data.read_data_to_variable(path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet, use_gpu=use_gpu, volatile=(not is_train), symbolic_root=True, lang_id=lang_id)
        return one_data

    data_test = _read_one(test_path, False)

    # data_test = conllx_data.read_data_to_variable(test_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet,
    #                                               use_gpu=use_gpu, volatile=True, symbolic_root=True)

    pred_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
    gold_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet, type_alphabet)

    logger.info('model: %s' % model_name)

    def load_model_arguments_from_json():
        arguments = json.load(open(arg_path, 'r'))
        return arguments['args'], arguments['kwargs']

    arg_path = model_name + '.arg.json'
    args, kwargs = load_model_arguments_from_json()
    network = BiRecurrentConvBiAffine(use_gpu=use_gpu, *args, **kwargs)
    network.load_state_dict(torch.load(model_name))

    #
    augment_network_embed(word_alphabet.size(), network, extra_embeds_arr)

    if use_gpu:
        network.cuda()
    else:
        network.cpu()

    network.eval()


    if decoding == 'greedy':
        decode = network.decode
    elif decoding == 'mst':
        decode = network.decode_mst
    elif decoding == 'proj':
        decode = network.decode_proj
    else:
        raise ValueError('Unknown decoding algorithm: %s' % decoding)

    # pred_writer.start('tmp/analyze_pred_%s' % str(uid))
    # gold_writer.start('tmp/analyze_gold_%s' % str(uid))
    # pred_writer.start(model_path + out_filename + '_pred')
    # gold_writer.start(model_path + out_filename + '_gold')
    pred_writer.start(out_filename + '_pred')
    gold_writer.start(out_filename + '_gold')

    sent = 0
    start_time = time.time()

    constraints = []
    
    mt_log.write("=====================%s, Ablation 2================\n"%(constraints_method))
    summary_log.write("==========================%s, Ablation 2=============\n"%(constraints_method))
    if ratioFile == 'WALS':
        import pickle as pk
        cFile = open(constraintFile, 'rb')
        WALS_data = pk.load(cFile)
        for idx in ['85A', '87A', '89A']:
            constraint = Constraint(0,0,0)
            extra_const = constraint.load_WALS(idx, WALS_data[the_language][idx], pos_alphabet, method=constraints_method)
            constraints.append(constraint)
            if extra_const:
                constraints.append(extra_const)
        constraint = Constraint(0,0,0)
        extra_const = constraint.load_WALS_unary(WALS_data[the_language], pos_alphabet, method=constraints_method)
        if extra_const:
            constraints.append(extra_const)
        constraints.append(constraint)
    elif ratioFile == 'None':
        summary_log.write("=================No it is baseline================\n")
        mt_log.write("==================No it is baseline==============\n")
    else:
        cFile = open(constraintFile, 'r')
        for line in cFile:
            if len(line.strip()) < 2:
               break
            pos1, pos2 = line.strip().split('\t')
            constraint = Constraint(0,0,0)
            constraint.load(pos1, pos2, ratioFile, pos_alphabet)
            constraints.append(constraint)
    
    test_ucorrect = 0.0
    test_lcorrect = 0.0
    test_ucomlpete_match = 0.0
    test_lcomplete_match = 0.0
    test_total = 0

    test_ucorrect_nopunc = 0.0
    test_lcorrect_nopunc = 0.0
    test_ucomlpete_match_nopunc = 0.0
    test_lcomplete_match_nopunc = 0.0
    test_total_nopunc = 0
    test_total_inst = 0

    test_root_correct = 0.0
    test_total_root = 0
    arc_list = []
    type_list = []
    length_list = []
    pos_list = []
    
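    # first pass over the test data: collect arc/type scores so the constraint parameters can be fitted before decoding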
    for batch in conllx_data.iterate_batch_variable(data_test, 1):
        word, char, pos, heads, types, masks, lengths = batch
        out_arc, out_type, length = network.pretrain_constraint(word, char, pos, mask=masks, length=lengths, leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
        arc_list += list(out_arc)
        type_list += list(out_type)
        length_list += list(length)
        pos_list += list(pos)
        
    if constraints_method == 'binary':
        train_constraints = network.binary_constraints
    elif constraints_method == 'Lagrange':
        train_constraints = network.Lagrange_constraints
    elif constraints_method == 'PR':
        train_constraints = network.PR_constraints
    else:
        raise ValueError('Unknown constraints method: %s' % constraints_method)
    train_constraints(arc_list, type_list, length_list, pos_list, constraints, tolerance, mt_log, gamma=gamma)

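    # second pass: decode with the fitted constraints and evaluate against the gold trees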
    for batch in conllx_data.iterate_batch_variable(data_test, 1):
        #sys.stdout.write('%d, ' % sent)
        #sys.stdout.flush()
        sent += 1

        word, char, pos, heads, types, masks, lengths = batch
        heads_pred, types_pred = decode(word, char, pos, mask=masks, length=lengths,
                                        leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS, constraints=constraints, method=constraints_method, gamma=gamma)
        word = word.data.cpu().numpy()
        pos = pos.data.cpu().numpy()
        lengths = lengths.cpu().numpy()
        heads = heads.data.cpu().numpy()
        types = types.data.cpu().numpy()

        pred_writer.write(word, pos, heads_pred, types_pred, lengths, symbolic_root=True)
        gold_writer.write(word, pos, heads, types, lengths, symbolic_root=True)

        stats, stats_nopunc, stats_root, num_inst = parser.eval(word, pos, heads_pred, types_pred, heads, types,
                                                                word_alphabet, pos_alphabet, lengths,
                                                                punct_set=punct_set, symbolic_root=True)
        ucorr, lcorr, total, ucm, lcm = stats
        ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
        corr_root, total_root = stats_root

        test_ucorrect += ucorr
        test_lcorrect += lcorr
        test_total += total
        test_ucomlpete_match += ucm
        test_lcomplete_match += lcm

        test_ucorrect_nopunc += ucorr_nopunc
        test_lcorrect_nopunc += lcorr_nopunc
        test_total_nopunc += total_nopunc
        test_ucomlpete_match_nopunc += ucm_nopunc
        test_lcomplete_match_nopunc += lcm_nopunc

        test_root_correct += corr_root
        test_total_root += total_root

        test_total_inst += num_inst

    print('\ntime: %.2fs' % (time.time() - start_time))
    print('test W. Punct:  ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%' % (
        test_ucorrect, test_lcorrect, test_total, test_ucorrect * 100 / test_total, test_lcorrect * 100 / test_total,
        test_ucomlpete_match * 100 / test_total_inst, test_lcomplete_match * 100 / test_total_inst))
    print('test Wo Punct:  ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%' % (
        test_ucorrect_nopunc, test_lcorrect_nopunc, test_total_nopunc,
        test_ucorrect_nopunc * 100 / test_total_nopunc, test_lcorrect_nopunc * 100 / test_total_nopunc,
        test_ucomlpete_match_nopunc * 100 / test_total_inst, test_lcomplete_match_nopunc * 100 / test_total_inst))
    print('test Root: corr: %d, total: %d, acc: %.2f%%' % (
        test_root_correct, test_total_root, test_root_correct * 100 / test_total_root))
    mt_log.write('uas: %.2f, las: %.2f\n'%(test_ucorrect_nopunc * 100 / test_total_nopunc, test_lcorrect_nopunc * 100 / test_total_nopunc))
    summary_log.write('%s: %.2f %.2f\n'%(the_language, test_ucorrect_nopunc * 100 / test_total_nopunc, test_lcorrect_nopunc * 100 / test_total_nopunc))
    pred_writer.close()
    gold_writer.close()
Example #9
0
class BiaffineModel(object):
    def __init__(self, model_path, model_name):
        print("................................................")
        print("LOADING Biaffine Model")
        alphabet_path = os.path.join(model_path, 'alphabets/')
        model_name = os.path.join(model_path, model_name)

        self.word_alpha, self.char_alpha, self.tag_alpha, self.type_alpha = conllx_data.create_alphabets(alphabet_path, None, data_paths=[None, None], max_vocabulary_size=50000, embedd_dict=None)
        self.id2word = {v: k for k, v in self.word_alpha.instance2index.items()}
        
        num_words = self.word_alpha.size()
        num_chars = self.char_alpha.size()
        num_pos = self.tag_alpha.size()
        num_types = self.type_alpha.size()

        print("Word Alphabet Size: %d" % num_words)
        print("Character Alphabet Size: %d" % num_chars)
        print("POS Alphabet Size: %d" % num_pos)
        print("Type Alphabet Size: %d" % num_types)


        def load_model_arguments_from_json():
            arguments = json.load(open(arg_path, 'r'))
            return arguments['args'], arguments['kwargs']

        arg_path = model_name + '.arg.json'
        args, kwargs = load_model_arguments_from_json()
        self.network = BiRecurrentConvBiAffine(*args, **kwargs)
        self.network.load_state_dict(torch.load(model_name))
        
        self.network.id2word = self.id2word
        self.network.cuda()
        self.network.eval()

    def prepare_data(self, sentences, use_gpu=True):
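        # pad each sentence into the smallest bucket that fits it and build word/char/POS id tensors plus masks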
        ret_value = []
        for sentence in sentences:
            inst_size = sentence.length()
            data = None
            max_len = 0
            bucket = 0
            for bucket_size in _buckets:
                if inst_size < bucket_size:
                    bucket = bucket_size
                    data = [sentence.word_ids, sentence.seq_char_ids, sentence.tag_ids]
                    max_len = max([len(seq_char) for seq_char in sentence.seq_chars])
                    break
            if data is None:  # the sentence is longer than the largest bucket, so truncate it to that bucket size
                bucket = _buckets[-1]
                data = [sentence.word_ids[:bucket], sentence.seq_char_ids[:bucket], sentence.tag_ids[:bucket]]
                max_len = max([len(seq_char) for seq_char in sentence.seq_chars])
                

            char_length = min(utils.MAX_CHAR_LENGTH, max_len + utils.NUM_CHAR_PAD)
            wid_inputs = np.empty([1, bucket], dtype=np.int64)
            cid_inputs = np.empty([1, bucket, char_length], dtype=np.int64)
            pid_inputs = np.empty([1, bucket], dtype=np.int64)

            masks = np.zeros([1, bucket], dtype=np.float32)
            single = np.zeros([1, bucket], dtype=np.int64)

            lengths = np.empty(bucket, dtype=np.int64)

            wids = data[0]
            cid_seqs = data[1]
            pids = data[2]
            inst_size = len(wids)
            lengths[0] = inst_size
            # word ids
            wid_inputs[0, :inst_size] = wids
            wid_inputs[0, inst_size:] = PAD_ID_WORD
            for c, cids in enumerate(cid_seqs):
                limit = len(cids)
                if limit > char_length: limit = char_length
                cid_inputs[0, c, :limit] = cids[:limit]
                cid_inputs[0, c, limit:] = PAD_ID_CHAR
            cid_inputs[0, inst_size:, :] = PAD_ID_CHAR
            # pos ids
            pid_inputs[0, :inst_size] = pids
            pid_inputs[0, inst_size:] = PAD_ID_TAG
            # masks
            masks[0, :inst_size] = 1.0
            for j, wid in enumerate(wids):
                if self.word_alpha.is_singleton(wid):
                    single[0, j] = 1

            words = Variable(torch.from_numpy(wid_inputs), volatile=False)
            chars = Variable(torch.from_numpy(cid_inputs), volatile=False)
            pos = Variable(torch.from_numpy(pid_inputs), volatile=False)
            masks = Variable(torch.from_numpy(masks), volatile=False)
            single = Variable(torch.from_numpy(single), volatile=False)
            lengths = torch.from_numpy(lengths)
            if use_gpu:
                words = words.cuda()
                chars = chars.cuda()
                pos = pos.cuda()
                masks = masks.cuda()
                single = single.cuda()
                lengths = lengths.cuda()
            index = slice(0,1)
            ret_value.append((words[index], chars[index], pos[index], masks[index], lengths[index], sentence.words, sentence.edu_ids))
        return ret_value

    def get_syntax_feature(self, data_test, sentences):
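        # run the parser encoder per sentence and zero-pad the features up to the original sentence length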
        sent = 0
        syntax_features = []
        for data in data_test:
            cur_length = len(sentences[sent].words)
            word, char, pos, masks, lengths, original_words, edu_ids = data
            sent += 1
            syntax_feature = self.network.get_syntax_feature(original_words, word, char, pos, mask=masks, length=lengths)
            _ , sent_len, dim = syntax_feature.shape
            if sent_len != cur_length:
                assert sent_len < cur_length
                diff = cur_length - sent_len
                zeros = Variable(torch.zeros(1, diff, dim)).type(torch.FloatTensor).cuda()
                syntax_feature = torch.cat([syntax_feature, zeros], dim=1)
            syntax_features.append(syntax_feature)
        return syntax_features
Example #10
0
def main():
    args_parser = argparse.ArgumentParser(
        description='Tuning with stack pointer parser')
    args_parser.add_argument('--mode',
                             choices=['RNN', 'LSTM', 'GRU', 'FastLSTM'],
                             help='architecture of rnn',
                             required=True)
    args_parser.add_argument('--num_epochs',
                             type=int,
                             default=200,
                             help='Number of training epochs')
    args_parser.add_argument('--batch_size',
                             type=int,
                             default=64,
                             help='Number of sentences in each batch')
    #args_parser.add_argument('--decoder_input_size', type=int, default=256, help='Number of input units in decoder RNN.')
    args_parser.add_argument('--hidden_size',
                             type=int,
                             default=256,
                             help='Number of hidden units in RNN')
    args_parser.add_argument('--arc_space',
                             type=int,
                             default=128,
                             help='Dimension of tag space')
    args_parser.add_argument('--type_space',
                             type=int,
                             default=128,
                             help='Dimension of tag space')
    args_parser.add_argument('--encoder_layers',
                             type=int,
                             default=1,
                             help='Number of layers of encoder RNN')
    #args_parser.add_argument('--decoder_layers', type=int, default=1, help='Number of layers of decoder RNN')
    args_parser.add_argument('--num_filters',
                             type=int,
                             default=50,
                             help='Number of filters in CNN')
    # NOTE: action='store_true' simply turns the flag on when given
    args_parser.add_argument('--pos',
                             action='store_true',
                             help='use part-of-speech embedding.')
    args_parser.add_argument('--char',
                             action='store_true',
                             help='use character embedding and CNN.')
    args_parser.add_argument('--pos_dim',
                             type=int,
                             default=50,
                             help='Dimension of POS embeddings')
    args_parser.add_argument('--char_dim',
                             type=int,
                             default=50,
                             help='Dimension of Character embeddings')
    # NOTE: the argument MUST be one of the choices (when specified)
    args_parser.add_argument('--opt',
                             choices=['adam', 'sgd', 'adamax'],
                             help='optimization algorithm')
    args_parser.add_argument('--learning_rate',
                             type=float,
                             default=0.001,
                             help='Learning rate')
    args_parser.add_argument('--decay_rate',
                             type=float,
                             default=0.75,
                             help='Decay rate of learning rate')
    args_parser.add_argument('--max_decay',
                             type=int,
                             default=9,
                             help='Number of decays before stop')
    args_parser.add_argument('--double_schedule_decay',
                             type=int,
                             default=5,
                             help='Number of decays to double schedule')
    args_parser.add_argument('--clip',
                             type=float,
                             default=1.0,
                             help='gradient clipping')
    args_parser.add_argument('--gamma',
                             type=float,
                             default=0.0,
                             help='weight for regularization')
    args_parser.add_argument('--epsilon',
                             type=float,
                             default=1e-8,
                             help='epsilon for adam or adamax')
    args_parser.add_argument('--coverage',
                             type=float,
                             default=0.0,
                             help='weight for coverage loss')
    args_parser.add_argument('--p_rnn',
                             nargs=2,
                             type=float,
                             required=True,
                             help='dropout rate for RNN')
    args_parser.add_argument('--p_in',
                             type=float,
                             default=0.33,
                             help='dropout rate for input embeddings')
    args_parser.add_argument('--p_out',
                             type=float,
                             default=0.33,
                             help='dropout rate for output layer')
    args_parser.add_argument('--label_smooth',
                             type=float,
                             default=1.0,
                             help='weight of label smoothing method')
    args_parser.add_argument('--skipConnect',
                             action='store_true',
                             help='use skip connection for decoder RNN.')
    args_parser.add_argument('--grandPar',
                             action='store_true',
                             help='use grand parent.')
    args_parser.add_argument('--sibling',
                             action='store_true',
                             help='use sibling.')
    args_parser.add_argument(
        '--prior_order',
        choices=['inside_out', 'left2right', 'deep_first', 'shallow_first'],
        help='prior order of children.',
        required=True)
    args_parser.add_argument('--schedule',
                             type=int,
                             help='schedule for learning rate decay')
    args_parser.add_argument(
        '--unk_replace',
        type=float,
        default=0.,
        help='The rate to replace a singleton word with UNK')
    args_parser.add_argument('--punctuation',
                             nargs='+',
                             type=str,
                             help='List of punctuations')
    args_parser.add_argument('--beam',
                             type=int,
                             default=1,
                             help='Beam size for decoding')
    args_parser.add_argument(
        '--word_embedding',
        choices=['glove', 'senna', 'sskip', 'polyglot', 'NNLM'],
        help='Embedding for words',
        required=True)
    args_parser.add_argument('--word_path',
                             help='path for word embedding dict')
    args_parser.add_argument(
        '--freeze',
        action='store_true',
        help='freeze the word embedding (disable fine-tuning).')
    args_parser.add_argument('--char_embedding',
                             choices=['random', 'polyglot'],
                             help='Embedding for characters',
                             required=True)
    args_parser.add_argument('--char_path',
                             help='path for character embedding dict')
    args_parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    args_parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    args_parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"
    args_parser.add_argument('--model_path',
                             help='path for saving model file.',
                             required=True)
    args_parser.add_argument('--model_name',
                             help='name for saving model file.',
                             required=True)
    # TODO: to include in logging process
    args_parser.add_argument('--pos_embedding',
                             choices=[1, 2, 4],
                             type=int,
                             help='Embedding method for korean POS tag',
                             default=2)
    args_parser.add_argument('--pos_path', help='path for pos embedding dict')
    args_parser.add_argument('--elmo',
                             action='store_true',
                             help='use elmo embedding.')
    args_parser.add_argument('--elmo_path',
                             help='path for elmo embedding model.')
    args_parser.add_argument('--elmo_dim',
                             type=int,
                             help='dimension for elmo embedding model')
    #args_parser.add_argument('--fine_tune_path', help='fine tune starting from this state_dict')
    args_parser.add_argument('--model_version',
                             help='previous model version to load')

    #hoon : bert
    args_parser.add_argument(
        '--bert', action='store_true',
        help='use bert embedding.')  # true if using bert (hoon)
    args_parser.add_argument(
        '--etri_train',
        help='path for etri data of bert')  # etri train path(hoon)
    args_parser.add_argument(
        '--etri_dev', help='path for etri data of bert')  # etri dev path(hoon)
    args_parser.add_argument('--bert_path',
                             help='path for bert embedding model.')  # yjyj
    args_parser.add_argument('--bert_dim',
                             type=int,
                             help='dimension for bert embedding model')  # yjyj
    args_parser.add_argument('--bert_learning_rate',
                             type=float,
                             default=5e-5,
                             help='Bert Learning rate')

    args_parser.add_argument('--decode',
                             choices=['mst', 'greedy'],
                             help='decoding algorithm',
                             required=True)  #yj
    args_parser.add_argument('--objective',
                             choices=['cross_entropy', 'crf'],
                             default='cross_entropy',
                             help='objective function of training procedure.')

    args = args_parser.parse_args()

    logger = get_logger("PtrParser")

    mode = args.mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    model_path = args.model_path + uid + '/'  # for numerous experiments
    model_name = args.model_name
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    #input_size_decoder = args.decoder_input_size
    hidden_size = args.hidden_size
    arc_space = args.arc_space
    type_space = args.type_space
    encoder_layers = args.encoder_layers
    #decoder_layers = args.decoder_layers
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    opt = args.opt
    momentum = 0.9
    betas = (0.9, 0.9)
    eps = args.epsilon
    decay_rate = args.decay_rate
    clip = args.clip
    gamma = args.gamma
    cov = args.coverage
    schedule = args.schedule
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    label_smooth = args.label_smooth
    unk_replace = args.unk_replace
    prior_order = args.prior_order
    skipConnect = args.skipConnect
    grandPar = args.grandPar
    sibling = args.sibling
    beam = args.beam
    punctuation = args.punctuation

    freeze = args.freeze
    word_embedding = args.word_embedding
    word_path = args.word_path

    use_char = args.char
    char_embedding = args.char_embedding
    # QUESTION: pretrained vector for char?
    char_path = args.char_path

    use_pos = False
    pos_embedding = args.pos_embedding
    pos_path = args.pos_path
    pos_dict = None
    pos_dim = args.pos_dim  # NOTE: if a pretrained POS embedding is provided, pos_dim follows its dimension
    if pos_path is not None:
        pos_dict, pos_dim = utils.load_embedding_dict(
            word_embedding,
            pos_path)  # NOTE: temporarily uses the same format as word_embedding (NNLM)
    word_dict, word_dim = utils.load_embedding_dict(word_embedding, word_path)
    char_dict = None
    char_dim = args.char_dim
    if char_embedding != 'random':
        char_dict, char_dim = utils.load_embedding_dict(
            char_embedding, char_path)

    use_elmo = args.elmo
    elmo_path = args.elmo_path
    elmo_dim = args.elmo_dim
    #fine_tune_path = args.fine_tune_path

    #bert(hoon)
    use_bert = args.bert
    #bert yj
    bert_path = args.bert_path
    bert_dim = args.bert_dim
    bert_lr = args.bert_learning_rate

    etri_train_path = args.etri_train
    etri_dev_path = args.etri_dev

    obj = args.objective
    decoding = args.decode

    logger.info("Creating Alphabets")
    alphabet_path = os.path.join(model_path, 'alphabets/')
    model_name = os.path.join(model_path, model_name)
    # min_occurence=1
    data_paths = [dev_path, test_path] if test_path else [dev_path]
    word_alphabet, char_alphabet, pos_alphabet, type_alphabet = conllx_stacked_data.create_alphabets(
        alphabet_path,
        train_path,
        data_paths=data_paths,
        max_vocabulary_size=50000,
        pos_embedding=pos_embedding,
        embedd_dict=word_dict)

    num_words = word_alphabet.size()  # 30268
    num_chars = char_alphabet.size()  # 3545
    num_pos = pos_alphabet.size()  # 46
    num_types = type_alphabet.size()  # 39

    logger.info("Word Alphabet Size: %d" % num_words)
    logger.info("Character Alphabet Size: %d" % num_chars)
    logger.info("POS Alphabet Size: %d" % num_pos)
    logger.info("Type Alphabet Size: %d" % num_types)

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    # data is a list of tuple containing tensors, etc ...
    data_train = conllx_stacked_data.read_stacked_data_to_variable(
        train_path,
        word_alphabet,
        char_alphabet,
        pos_alphabet,
        type_alphabet,
        pos_embedding,
        use_gpu=use_gpu,
        prior_order=prior_order,
        elmo=use_elmo,
        bert=use_bert,
        etri_path=etri_train_path)
    num_data = sum(data_train[2])

    data_dev = conllx_stacked_data.read_stacked_data_to_variable(
        dev_path,
        word_alphabet,
        char_alphabet,
        pos_alphabet,
        type_alphabet,
        pos_embedding,
        use_gpu=use_gpu,
        volatile=True,
        prior_order=prior_order,
        elmo=use_elmo,
        bert=use_bert,
        etri_path=etri_dev_path)
    if test_path:
        data_test = conllx_stacked_data.read_stacked_data_to_variable(
            test_path,
            word_alphabet,
            char_alphabet,
            pos_alphabet,
            type_alphabet,
            pos_embedding,
            use_gpu=use_gpu,
            volatile=True,
            prior_order=prior_order,
            elmo=use_elmo)

    punct_set = None
    if punctuation is not None:
        punct_set = set(punctuation)
        logger.info("punctuations(%d): %s" %
                    (len(punct_set), ' '.join(punct_set)))

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / word_dim)
        table = np.empty([word_alphabet.size(), word_dim], dtype=np.float32)
        # NOTE: handle the UNK entry explicitly
        table[conllx_stacked_data.UNK_ID, :] = np.zeros([1, word_dim]).astype(
            np.float32) if freeze else np.random.uniform(
                -scale, scale, [1, word_dim]).astype(np.float32)
        oov = 0
        for word, index in list(word_alphabet.items()):
            if word in word_dict:
                embedding = word_dict[word]
            elif word.lower() in word_dict:
                embedding = word_dict[word.lower()]
            else:
                # NOTE: OOV words get zero vectors when frozen, random vectors otherwise
                embedding = np.zeros([1, word_dim]).astype(
                    np.float32) if freeze else np.random.uniform(
                        -scale, scale, [1, word_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('word OOV: %d' % oov)
        return torch.from_numpy(table)

    def construct_char_embedding_table():
        if char_dict is None:
            return None

        scale = np.sqrt(3.0 / char_dim)
        table = np.empty([num_chars, char_dim], dtype=np.float32)
        table[conllx_stacked_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, char_dim]).astype(np.float32)
        oov = 0
        #for char, index, in char_alphabet.items():
        for char, index in list(char_alphabet.items()):
            if char in char_dict:
                embedding = char_dict[char]
            else:
                embedding = np.random.uniform(-scale, scale,
                                              [1, char_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('character OOV: %d' % oov)
        return torch.from_numpy(table)

    def construct_pos_embedding_table():
        if pos_dict is None:
            return None

        scale = np.sqrt(3.0 / pos_dim)
        table = np.empty([num_pos, pos_dim], dtype=np.float32)
        for pos, index in list(pos_alphabet.items()):
            if pos in pos_dict:
                embedding = pos_dict[pos]
            else:
                embedding = np.random.uniform(-scale, scale,
                                              [1, pos_dim]).astype(np.float32)
            table[index, :] = embedding
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    char_table = construct_char_embedding_table()
    pos_table = construct_pos_embedding_table()

    window = 3

    # yj: modified
    # network = StackPtrNet(word_dim, num_words, char_dim, num_chars, pos_dim, num_pos, num_filters, window,
    #                       mode, input_size_decoder, hidden_size, encoder_layers, decoder_layers,
    #                       num_types, arc_space, type_space, pos_embedding,
    #                       embedd_word=word_table, embedd_char=char_table, embedd_pos=pos_table, p_in=p_in, p_out=p_out,
    #                       p_rnn=p_rnn, biaffine=True, pos=use_pos, char=use_char, elmo=use_elmo, prior_order=prior_order,
    #                       skipConnect=skipConnect, grandPar=grandPar, sibling=sibling, elmo_path=elmo_path, elmo_dim=elmo_dim,
    #                       bert = use_bert, bert_path=bert_path, bert_dim=bert_dim)

    network = BiRecurrentConvBiAffine(word_dim,
                                      num_words,
                                      char_dim,
                                      num_chars,
                                      pos_dim,
                                      num_pos,
                                      num_filters,
                                      window,
                                      mode,
                                      hidden_size,
                                      encoder_layers,
                                      num_types,
                                      arc_space,
                                      type_space,
                                      embedd_word=word_table,
                                      embedd_char=char_table,
                                      embedd_pos=pos_table,
                                      p_in=p_in,
                                      p_out=p_out,
                                      p_rnn=p_rnn,
                                      biaffine=True,
                                      pos=use_pos,
                                      char=use_char,
                                      elmo=use_elmo,
                                      elmo_path=elmo_path,
                                      elmo_dim=elmo_dim,
                                      bert=use_bert,
                                      bert_path=bert_path,
                                      bert_dim=bert_dim)

    # if fine_tune_path is not None:
    #     pretrained_dict = torch.load(fine_tune_path)
    #     model_dict = network.state_dict()
    #     # select
    #     #model_dict['pos_embedd.weight'] = pretrained_dict['pos_embedd.weight']
    #     model_dict['word_embedd.weight'] = pretrained_dict['word_embedd.weight']
    #     #model_dict['char_embedd.weight'] = pretrained_dict['char_embedd.weight']
    #     network.load_state_dict(model_dict)

    model_ver = args.model_version
    if model_ver is not None:
        savePath = args.model_path + model_ver + 'network.pt'
        network.load_state_dict(torch.load(savePath))
        logger.info('Load model: %s' % (model_ver))

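    # Persist the network's constructor args/kwargs as JSON so the model can be
    # re-instantiated with the same configuration at load time.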
    def save_args():
        arg_path = model_name + '.arg.json'
        arguments = [
            word_dim, num_words, char_dim, num_chars, pos_dim, num_pos,
            num_filters, window, mode, hidden_size, encoder_layers, num_types,
            arc_space, type_space, pos_embedding
        ]
        kwargs = {
            'p_in': p_in,
            'p_out': p_out,
            'p_rnn': p_rnn,
            'biaffine': True,
            'pos': use_pos,
            'char': use_char,
            'elmo': use_elmo,
            'bert': use_bert
        }
        with open(arg_path, 'w', encoding="utf-8") as f:
            json.dump({'args': arguments, 'kwargs': kwargs}, f, indent=4)

        with open(arg_path + '.raw_args', 'w', encoding="utf-8") as f:
            f.write(str(args))

    if freeze:
        network.word_embedd.freeze()

    if use_gpu:
        network.cuda()

    save_args()

    pred_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet,
                               type_alphabet, pos_embedding)
    gold_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet,
                               type_alphabet, pos_embedding)

    def generate_optimizer(opt, lr, params):
        # params = [param for name, param in params if param.requires_grad]
        params = [param for name, param in params]
        if True:  # NOTE: short-circuits the branches below, so AdamW is always used regardless of --opt
            return AdamW(params, lr=lr, betas=betas, weight_decay=gamma)
        if opt == 'adam':
            return Adam(params,
                        lr=lr,
                        betas=betas,
                        weight_decay=gamma,
                        eps=eps)
        elif opt == 'sgd':
            return SGD(params,
                       lr=lr,
                       momentum=momentum,
                       weight_decay=gamma,
                       nesterov=True)
        elif opt == 'adamax':
            return Adamax(params,
                          lr=lr,
                          betas=betas,
                          weight_decay=gamma,
                          eps=eps)
        else:
            raise ValueError('Unknown optimization algorithm: %s' % opt)

    # For now, follow the default huggingface BERT optimizer settings:
    # AdamW with weight decay disabled for bias/LayerNorm parameters, plus a linear warmup schedule.
    def generate_bert_optimizer(t_total, bert_lr, model):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            gamma
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=bert_lr, eps=1e-8)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=0,
                                         t_total=t_total)

        return scheduler, optimizer

    lr = learning_rate
    if use_bert:
        scheduler, optim = generate_bert_optimizer(
            len(data_train) * num_epochs, lr, network)
    else:
        optim = generate_optimizer(opt, lr, network.named_parameters())

    opt_info = 'opt: %s, ' % opt
    if opt == 'adam':
        opt_info += 'betas=%s, eps=%.1e' % (betas, eps)
    elif opt == 'sgd':
        opt_info += 'momentum=%.2f' % momentum
    elif opt == 'adamax':
        opt_info += 'betas=%s, eps=%.1e' % (betas, eps)

    word_status = 'frozen' if freeze else 'fine tune'
    char_status = 'enabled' if use_char else 'disabled'
    pos_status = 'enabled' if use_pos else 'disabled'
    logger.info(
        "Embedding dim: word=%d (%s), char=%d (%s), pos=%d (%s)" %
        (word_dim, word_status, char_dim, char_status, pos_dim, pos_status))
    logger.info("CNN: filter=%d, kernel=%d" % (num_filters, window))
    #logger.info("RNN: %s, num_layer=(%d, %d), input_dec=%d, hidden=%d, arc_space=%d, type_space=%d" % (mode, encoder_layers, decoder_layers, input_size_decoder, hidden_size, arc_space, type_space))
    logger.info(
        "train: cov: %.1f, (#data: %d, batch: %d, clip: %.2f, label_smooth: %.2f, unk_repl: %.2f)"
        % (cov, num_data, batch_size, clip, label_smooth, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" %
                (p_in, p_out, p_rnn))
    logger.info('prior order: %s, grand parent: %s, sibling: %s, ' %
                (prior_order, grandPar, sibling))
    logger.info('skip connect: %s, beam: %d' % (skipConnect, beam))
    logger.info(opt_info)

    num_batches = int(num_data / batch_size + 1)  # kwon
    dev_ucorrect = 0.0
    dev_lcorrect = 0.0
    dev_ucomlpete_match = 0.0
    dev_lcomplete_match = 0.0

    dev_ucorrect_nopunc = 0.0
    dev_lcorrect_nopunc = 0.0
    dev_ucomlpete_match_nopunc = 0.0
    dev_lcomplete_match_nopunc = 0.0
    dev_root_correct = 0.0

    best_epoch = 0

    test_ucorrect = 0.0
    test_lcorrect = 0.0
    test_ucomlpete_match = 0.0
    test_lcomplete_match = 0.0

    test_ucorrect_nopunc = 0.0
    test_lcorrect_nopunc = 0.0
    test_ucomlpete_match_nopunc = 0.0
    test_lcomplete_match_nopunc = 0.0
    test_root_correct = 0.0
    test_total = 0
    test_total_nopunc = 0
    test_total_inst = 0
    test_total_root = 0

    if decoding == 'greedy':
        decode = network.decode
    elif decoding == 'mst':
        decode = network.decode_mst
    else:
        raise ValueError('Unknown decoding algorithm: %s' % decoding)

    patient = 0
    decay = 0
    max_decay = args.max_decay
    double_schedule_decay = args.double_schedule_decay
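    # Main training loop: each epoch iterates over the training batches,
    # accumulates arc/type losses, then evaluates on the dev (and best-epoch test) data.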
    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s, optim: %s, learning rate=%.6f, eps=%.1e, decay rate=%.2f (schedule=%d, patient=%d, decay=%d)): '
            %
            (epoch, mode, opt, lr, eps, decay_rate, schedule, patient, decay))
        train_err = 0.
        train_err_arc = 0.
        train_err_type = 0.
        train_total = 0.
        start_time = time.time()
        num_back = 0

        network.train()
        for batch in range(1, num_batches + 1):
            # load data
            input_encoder, _ = conllx_stacked_data.get_batch_stacked_variable(
                data_train,
                batch_size,
                pos_embedding,
                unk_replace=unk_replace,
                elmo=use_elmo,
                bert=use_bert)

            word_elmo = None
            if use_elmo:
                word, char, pos, heads, types, masks, lengths, word_elmo, word_bert = input_encoder
            else:
                word, char, pos, heads, types, masks, lengths, word_bert = input_encoder

            #stacked_heads, children, sibling, stacked_types, skip_connect, masks_d, lengths_d = input_decoder

            optim.zero_grad()

            # yjyj
            loss_arc, loss_type, bert_word_feature_ids, bert_morp_feature_ids = network.loss(
                word,
                char,
                pos,
                heads,
                types,
                mask=masks,
                length=lengths,
                input_word_bert=word_bert)

            # loss_arc_leaf, loss_arc_non_leaf, \
            # loss_type_leaf, loss_type_non_leaf, \
            # loss_cov, num_leaf, num_non_leaf = network.loss(word, char, pos, heads, stacked_heads, children, sibling, stacked_types, label_smooth, skip_connect=skip_connect, mask_e=masks_e, \
            #                                                 length_e=lengths_e, mask_d=masks_d, length_d=lengths_d, input_word_elmo = word_elmo, input_word_bert = word_bert)

            # loss_arc = loss_arc_leaf + loss_arc_non_leaf
            # loss_type = loss_type_leaf + loss_type_non_leaf
            # loss = loss_arc + loss_type + cov * loss_cov    # cov is set to 0 by default
            loss = loss_arc + loss_type
            loss.backward()
            clip_grad_norm_(network.parameters(), clip)
            optim.step()
            if use_bert:
                pass
                #bert_optim.step()
                #scheduler.step()

            num_inst = word.size(
                0) if obj == 'crf' else masks.data.sum() - word.size(0)
            train_err += loss.item() * num_inst
            train_err_arc += loss_arc.item() * num_inst
            train_err_type += loss_type.item() * num_inst
            train_total += num_inst

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # yjyj
            # num_leaf = num_leaf.item()
            # num_non_leaf = num_non_leaf.item()
            # train_err_arc_leaf += loss_arc_leaf.item() * num_leaf
            # train_err_arc_non_leaf += loss_arc_non_leaf.item() * num_non_leaf
            #
            # train_err_type_leaf += loss_type_leaf.item() * num_leaf
            # train_err_type_non_leaf += loss_type_non_leaf.item() * num_non_leaf
            #
            # train_err_cov += loss_cov.item() * (num_leaf + num_non_leaf)
            # train_total_leaf += num_leaf
            # train_total_non_leaf += num_non_leaf
            #
            # time_ave = (time.time() - start_time) / batch
            # time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 10 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, arc: %.4f, type: %.4f, time left: %.2fs' % (
                    batch, num_batches, train_err / train_total, train_err_arc
                    / train_total, train_err_type / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print(
            'train: %d loss: %.4f, arc: %.4f, type: %.4f, time: %.2fs' %
            (num_batches, train_err / train_total, train_err_arc / train_total,
             train_err_type / train_total, time.time() - start_time))
        # yjyj
        #     if batch % 10 == 0:
        #         sys.stdout.write("\b" * num_back)
        #         sys.stdout.write(" " * num_back)
        #         sys.stdout.write("\b" * num_back)
        #         err_arc_leaf = train_err_arc_leaf / train_total_leaf
        #         err_arc_non_leaf = train_err_arc_non_leaf / train_total_non_leaf
        #         err_arc = err_arc_leaf + err_arc_non_leaf
        #
        #         err_type_leaf = train_err_type_leaf / train_total_leaf
        #         err_type_non_leaf = train_err_type_non_leaf / train_total_non_leaf
        #         err_type = err_type_leaf + err_type_non_leaf
        #
        #         err_cov = train_err_cov / (train_total_leaf + train_total_non_leaf)
        #
        #         err = err_arc + err_type + cov * err_cov
        #         log_info = 'train: %d/%d loss (leaf, non_leaf): %.4f, arc: %.4f (%.4f, %.4f), type: %.4f (%.4f, %.4f), coverage: %.4f, time left (estimated): %.2fs' % (
        #             batch, num_batches, err, err_arc, err_arc_leaf, err_arc_non_leaf, err_type, err_type_leaf, err_type_non_leaf, err_cov, time_left)
        #         sys.stdout.write(log_info)
        #         sys.stdout.flush()
        #         num_back = len(log_info)
        #
        # sys.stdout.write("\b" * num_back)
        # sys.stdout.write(" " * num_back)
        # sys.stdout.write("\b" * num_back)
        # err_arc_leaf = train_err_arc_leaf / train_total_leaf
        # err_arc_non_leaf = train_err_arc_non_leaf / train_total_non_leaf
        # err_arc = err_arc_leaf + err_arc_non_leaf
        #
        # err_type_leaf = train_err_type_leaf / train_total_leaf
        # err_type_non_leaf = train_err_type_non_leaf / train_total_non_leaf
        # err_type = err_type_leaf + err_type_non_leaf
        #
        # err_cov = train_err_cov / (train_total_leaf + train_total_non_leaf)
        #
        # err = err_arc + err_type + cov * err_cov
        # print('train: %d loss (leaf, non_leaf): %.4f, arc: %.4f (%.4f, %.4f), type: %.4f (%.4f, %.4f), coverage: %.4f, time: %.2fs' % (
        #     num_batches, err, err_arc, err_arc_leaf, err_arc_non_leaf, err_type, err_type_leaf, err_type_non_leaf, err_cov, time.time() - start_time))

        # evaluate performance on dev data
        network.eval()
        pred_filename = model_path + 'tmp/pred_dev%d' % (epoch)
        pred_writer.start(pred_filename)
        gold_filename = model_path + 'tmp/gold_dev%d' % (epoch)
        gold_writer.start(gold_filename)

        dev_ucorr = 0.0
        dev_lcorr = 0.0
        dev_total = 0
        dev_ucomlpete = 0.0
        dev_lcomplete = 0.0
        dev_ucorr_nopunc = 0.0
        dev_lcorr_nopunc = 0.0
        dev_total_nopunc = 0
        dev_ucomlpete_nopunc = 0.0
        dev_lcomplete_nopunc = 0.0
        dev_root_corr = 0.0
        dev_total_root = 0.0
        dev_total_inst = 0.0
        for batch in conllx_stacked_data.iterate_batch_stacked_variable(
                data_dev, batch_size, pos_embedding, type='dev',
                elmo=use_elmo):
            input_encoder, _ = batch
            # @TODO: handle input_word_elmo and input_word_bert here
            if use_elmo:
                word, char, pos, heads, types, masks, lengths, word_elmo, word_bert = input_encoder
                heads_pred, types_pred = decode(
                    word,
                    char,
                    pos,
                    mask=masks,
                    length=lengths,
                    leading_symbolic=conllx_stacked_data.NUM_SYMBOLIC_TAGS)
                # heads_pred, types_pred, _, _ = network.decode(word, char, pos, input_word_elmo=word_elmo, mask=masks,
                #                                               length=lengths, beam=beam,
                #                                               leading_symbolic=conllx_stacked_data.NUM_SYMBOLIC_TAGS, input_word_bert=word_bert)
            else:
                word, char, pos, heads, types, masks, lengths, word_bert = input_encoder
                heads_pred, types_pred, bert_word_feature_ids, bert_morp_feature_ids = decode(
                    word,
                    char,
                    pos,
                    mask=masks,
                    length=lengths,
                    leading_symbolic=conllx_stacked_data.NUM_SYMBOLIC_TAGS,
                    input_word_bert=word_bert)
                # heads_pred, types_pred, _, _ = network.decode(word, char, pos, mask=masks, length=lengths, beam=beam,
                #                                               leading_symbolic=conllx_stacked_data.NUM_SYMBOLIC_TAGS, input_word_bert=word_bert)

            word = word.data.cpu().numpy()
            pos = pos.data.cpu().numpy()
            lengths = lengths.cpu().numpy()
            heads = heads.data.cpu().numpy()
            types = types.data.cpu().numpy()

            pred_writer.write(word,
                              pos,
                              heads_pred,
                              types_pred,
                              lengths,
                              symbolic_root=True)
            gold_writer.write(word,
                              pos,
                              heads,
                              types,
                              lengths,
                              symbolic_root=True)

            stats, stats_nopunc, stats_root, num_inst = parser_bpe.eval(
                word,
                pos,
                heads_pred,
                types_pred,
                heads,
                types,
                word_alphabet,
                pos_alphabet,
                lengths,
                punct_set=punct_set,
                symbolic_root=True,
                bert_word_feature_ids=bert_word_feature_ids,
                bert_morp_feature_ids=bert_morp_feature_ids)
            ucorr, lcorr, total, ucm, lcm = stats
            ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
            corr_root, total_root = stats_root

            dev_ucorr += ucorr
            dev_lcorr += lcorr
            dev_total += total
            dev_ucomlpete += ucm
            dev_lcomplete += lcm

            dev_ucorr_nopunc += ucorr_nopunc
            dev_lcorr_nopunc += lcorr_nopunc
            dev_total_nopunc += total_nopunc
            dev_ucomlpete_nopunc += ucm_nopunc
            dev_lcomplete_nopunc += lcm_nopunc

            dev_root_corr += corr_root
            dev_total_root += total_root

            dev_total_inst += num_inst

        pred_writer.close()
        gold_writer.close()
        print(
            'W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%'
            % (dev_ucorr, dev_lcorr, dev_total, dev_ucorr * 100 / dev_total,
               dev_lcorr * 100 / dev_total, dev_ucomlpete * 100 /
               dev_total_inst, dev_lcomplete * 100 / dev_total_inst))
        print(
            'Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%'
            % (dev_ucorr_nopunc, dev_lcorr_nopunc, dev_total_nopunc,
               dev_ucorr_nopunc * 100 / dev_total_nopunc, dev_lcorr_nopunc *
               100 / dev_total_nopunc, dev_ucomlpete_nopunc * 100 /
               dev_total_inst, dev_lcomplete_nopunc * 100 / dev_total_inst))
        print('Root: corr: %d, total: %d, acc: %.2f%%' %
              (dev_root_corr, dev_total_root,
               dev_root_corr * 100 / dev_total_root))

        if dev_ucorrect_nopunc * 1.5 + dev_lcorrect_nopunc < dev_ucorr_nopunc * 1.5 + dev_lcorr_nopunc:
            dev_ucorrect_nopunc = dev_ucorr_nopunc
            dev_lcorrect_nopunc = dev_lcorr_nopunc
            dev_ucomlpete_match_nopunc = dev_ucomlpete_nopunc
            dev_lcomplete_match_nopunc = dev_lcomplete_nopunc

            dev_ucorrect = dev_ucorr
            dev_lcorrect = dev_lcorr
            dev_ucomlpete_match = dev_ucomlpete
            dev_lcomplete_match = dev_lcomplete

            dev_root_correct = dev_root_corr

            best_epoch = epoch
            patient = 0
            # torch.save(network, model_name)
            torch.save(network.state_dict(), model_name)
            # save embedding to txt
            # FIXME format!
            #with open(model_path + 'embedding.txt', 'w') as f:
            #    for word, idx in word_alphabet.items():
            #        embedding = network.word_embedd.weight[idx, :]
            #        f.write('{}\t{}\n'.format(word, embedding))

            if test_path:
                pred_filename = model_path + 'tmp/%spred_test%d' % (str(uid),
                                                                    epoch)
                pred_writer.start(pred_filename)
                gold_filename = model_path + 'tmp/%sgold_test%d' % (str(uid),
                                                                    epoch)
                gold_writer.start(gold_filename)

                test_ucorrect = 0.0
                test_lcorrect = 0.0
                test_ucomlpete_match = 0.0
                test_lcomplete_match = 0.0
                test_total = 0

                test_ucorrect_nopunc = 0.0
                test_lcorrect_nopunc = 0.0
                test_ucomlpete_match_nopunc = 0.0
                test_lcomplete_match_nopunc = 0.0
                test_total_nopunc = 0
                test_total_inst = 0

                test_root_correct = 0.0
                test_total_root = 0
                for batch in conllx_stacked_data.iterate_batch_stacked_variable(
                        data_test, batch_size, pos_embedding, type='dev'):
                    input_encoder, _ = batch
                    word, char, pos, heads, types, masks, lengths = input_encoder

                    # yjyj
                    # heads_pred, types_pred, _, _ = network.decode(word, char, pos, mask=masks, length=lengths, beam=beam, leading_symbolic=conllx_stacked_data.NUM_SYMBOLIC_TAGS)
                    heads_pred, types_pred = decode(
                        word,
                        char,
                        pos,
                        mask=masks,
                        length=lengths,
                        leading_symbolic=conllx_stacked_data.NUM_SYMBOLIC_TAGS)
                    word = word.data.cpu().numpy()
                    pos = pos.data.cpu().numpy()
                    lengths = lengths.cpu().numpy()
                    heads = heads.data.cpu().numpy()
                    types = types.data.cpu().numpy()

                    pred_writer.write(word,
                                      pos,
                                      heads_pred,
                                      types_pred,
                                      lengths,
                                      symbolic_root=True)
                    gold_writer.write(word,
                                      pos,
                                      heads,
                                      types,
                                      lengths,
                                      symbolic_root=True)

                    stats, stats_nopunc, stats_root, num_inst = parser_bpe.eval(
                        word,
                        pos,
                        heads_pred,
                        types_pred,
                        heads,
                        types,
                        word_alphabet,
                        pos_alphabet,
                        lengths,
                        punct_set=punct_set,
                        symbolic_root=True)
                    ucorr, lcorr, total, ucm, lcm = stats
                    ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
                    corr_root, total_root = stats_root

                    test_ucorrect += ucorr
                    test_lcorrect += lcorr
                    test_total += total
                    test_ucomlpete_match += ucm
                    test_lcomplete_match += lcm

                    test_ucorrect_nopunc += ucorr_nopunc
                    test_lcorrect_nopunc += lcorr_nopunc
                    test_total_nopunc += total_nopunc
                    test_ucomlpete_match_nopunc += ucm_nopunc
                    test_lcomplete_match_nopunc += lcm_nopunc

                    test_root_correct += corr_root
                    test_total_root += total_root

                    test_total_inst += num_inst

            pred_writer.close()
            gold_writer.close()
        else:
            if dev_ucorr_nopunc * 100 / dev_total_nopunc < dev_ucorrect_nopunc * 100 / dev_total_nopunc - 5 or patient >= schedule:
                # network = torch.load(model_name)
                network.load_state_dict(torch.load(model_name))
                lr = lr * decay_rate
                optim = generate_optimizer(opt, lr, network.named_parameters())
                patient = 0
                decay += 1
                if decay % double_schedule_decay == 0:
                    schedule *= 2
            else:
                patient += 1

        print(
            '----------------------------------------------------------------------------------------------------------------------------'
        )
        print(
            'best dev  W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
            % (dev_ucorrect, dev_lcorrect, dev_total,
               dev_ucorrect * 100 / dev_total, dev_lcorrect * 100 / dev_total,
               dev_ucomlpete_match * 100 / dev_total_inst,
               dev_lcomplete_match * 100 / dev_total_inst, best_epoch))
        print(
            'best dev  Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
            % (dev_ucorrect_nopunc, dev_lcorrect_nopunc, dev_total_nopunc,
               dev_ucorrect_nopunc * 100 / dev_total_nopunc,
               dev_lcorrect_nopunc * 100 / dev_total_nopunc,
               dev_ucomlpete_match_nopunc * 100 / dev_total_inst,
               dev_lcomplete_match_nopunc * 100 / dev_total_inst, best_epoch))
        print('best dev  Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)' %
              (dev_root_correct, dev_total_root,
               dev_root_correct * 100 / dev_total_root, best_epoch))
        print(
            '----------------------------------------------------------------------------------------------------------------------------'
        )
        if test_path:
            print(
                'best test W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
                % (test_ucorrect, test_lcorrect, test_total, test_ucorrect *
                   100 / test_total, test_lcorrect * 100 / test_total,
                   test_ucomlpete_match * 100 / test_total_inst,
                   test_lcomplete_match * 100 / test_total_inst, best_epoch))
            print(
                'best test Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
                %
                (test_ucorrect_nopunc, test_lcorrect_nopunc, test_total_nopunc,
                 test_ucorrect_nopunc * 100 / test_total_nopunc,
                 test_lcorrect_nopunc * 100 / test_total_nopunc,
                 test_ucomlpete_match_nopunc * 100 / test_total_inst,
                 test_lcomplete_match_nopunc * 100 / test_total_inst,
                 best_epoch))
            print(
                'best test Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)'
                % (test_root_correct, test_total_root,
                   test_root_correct * 100 / test_total_root, best_epoch))
            print(
                '============================================================================================================================'
            )

        if decay == max_decay:
            break

    def save_result():
        result_path = model_name + '.result.txt'
        best_dev_Punc = 'best dev  W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)' % (
            dev_ucorrect, dev_lcorrect, dev_total,
            dev_ucorrect * 100 / dev_total, dev_lcorrect * 100 / dev_total,
            dev_ucomlpete_match * 100 / dev_total_inst,
            dev_lcomplete_match * 100 / dev_total_inst, best_epoch)
        best_dev_noPunc = 'best dev  Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)' % (
            dev_ucorrect_nopunc, dev_lcorrect_nopunc, dev_total_nopunc,
            dev_ucorrect_nopunc * 100 / dev_total_nopunc,
            dev_lcorrect_nopunc * 100 / dev_total_nopunc,
            dev_ucomlpete_match_nopunc * 100 / dev_total_inst,
            dev_lcomplete_match_nopunc * 100 / dev_total_inst, best_epoch)
        best_dev_Root = 'best dev  Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)' % (
            dev_root_correct, dev_total_root,
            dev_root_correct * 100 / dev_total_root, best_epoch)
        with open(result_path, 'w', encoding='utf-8') as f:
            f.write(best_dev_Punc + '\n')
            f.write(best_dev_noPunc + '\n')
            f.write(best_dev_Root)

    save_result()


def biaffine(model_path, model_name, pre_model_path, pre_model_name, use_gpu, logger, args):
    alphabet_path = os.path.join(pre_model_path, 'alphabets/')
    logger.info("Alphabet Path: %s" % alphabet_path)
    pre_model_name = os.path.join(pre_model_path, pre_model_name)
    model_name = os.path.join(model_path, model_name)

    # Load pre-created alphabets
    word_alphabet, char_alphabet, pos_alphabet, type_alphabet, max_sent_length = conllx_data.create_alphabets(
        alphabet_path, None, data_paths=[None, None], max_vocabulary_size=50000, embedd_dict=None)

    num_words = word_alphabet.size()
    num_chars = char_alphabet.size()
    num_pos = pos_alphabet.size()
    num_types = type_alphabet.size()

    logger.info("Word Alphabet Size: %d" % num_words)
    logger.info("Character Alphabet Size: %d" % num_chars)
    logger.info("POS Alphabet Size: %d" % num_pos)
    logger.info("Type Alphabet Size: %d" % num_types)

    logger.info('use gpu: %s' % (use_gpu))

    if args.test_lang:
        extra_embed = args.embed_dir + ("wiki.multi.%s.vec" % args.test_lang)
        extra_word_dict, _ = load_embedding_dict('word2vec', extra_embed)
        test_path = args.data_dir + args.test_lang + '_test.conllu'
        extra_embeds_arr = augment_with_extra_embedding(word_alphabet, extra_word_dict, test_path, logger)
    else:
        extra_embeds_arr = []
        for language in args.langs:
            extra_embed = args.embed_dir + ("wiki.multi.%s.vec" % language)
            extra_word_dict, _ = load_embedding_dict('word2vec', extra_embed)

            test_path = args.data_dir + language + '_train.conllu'
            embeds_arr1 = augment_with_extra_embedding(word_alphabet, extra_word_dict, test_path, logger)
            test_path = args.data_dir + language + '_dev.conllu'
            embeds_arr2 = augment_with_extra_embedding(word_alphabet, extra_word_dict, test_path, logger)
            test_path = args.data_dir + language + '_test.conllu'
            embeds_arr3 = augment_with_extra_embedding(word_alphabet, extra_word_dict, test_path, logger)
            extra_embeds_arr.extend(embeds_arr1 + embeds_arr2 + embeds_arr3)

    # ------------------------------------------------------------------------- #
    # --------------------- Loading model ------------------------------------- #

    def load_model_arguments_from_json():
        arguments = json.load(open(arg_path, 'r'))
        return arguments['args'], arguments['kwargs']

    arg_path = pre_model_name + '.arg.json'
    margs, kwargs = load_model_arguments_from_json()
    network = BiRecurrentConvBiAffine(use_gpu=use_gpu, *margs, **kwargs)
    network.load_state_dict(torch.load(pre_model_name))
    args.use_bert = kwargs.get('use_bert', False)

    #
    augment_network_embed(word_alphabet.size(), network, extra_embeds_arr)

    network.eval()
    logger.info('model: %s' % pre_model_name)

    # Freeze the network
    for p in network.parameters():
        p.requires_grad = False

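    # Language-identification probe: a small linear classifier stacked on the
    # frozen parser encoder's output (word-level or sentence-averaged, per --train_level).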
    nclass = args.nclass
    classifier = nn.Sequential(
        nn.Linear(network.encoder.output_dim, 512),
        nn.Linear(512, nclass)
    )

    if use_gpu:
        network.cuda()
        classifier.cuda()
    else:
        network.cpu()
        classifier.cpu()

    batch_size = args.batch_size

    # ===== the reading
    def _read_one(path, is_train=False, max_size=None):
        lang_id = guess_language_id(path)
        logger.info("Reading: guess that the language of file %s is %s." % (path, lang_id))
        one_data = conllx_data.read_data_to_variable(path, word_alphabet, char_alphabet, pos_alphabet,
                                                     type_alphabet, use_gpu=use_gpu, volatile=(not is_train),
                                                     use_bert=args.use_bert, symbolic_root=True, lang_id=lang_id,
                                                     max_size=max_size)
        return one_data

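    # Computes classification accuracy over one language's data: every token
    # (or sentence, depending on --train_level) should be predicted as lang_idx.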
    def compute_accuracy(data, lang_idx):
        total_corr, total = 0, 0
        classifier.eval()
        with torch.no_grad():
            for batch in conllx_data.iterate_batch_variable(data, batch_size):
                word, char, pos, _, _, masks, lengths, bert_inputs = batch
                if use_gpu:
                    word = word.cuda()
                    char = char.cuda()
                    pos = pos.cuda()
                    masks = masks.cuda()
                    lengths = lengths.cuda()
                    if bert_inputs[0] is not None:
                        bert_inputs[0] = bert_inputs[0].cuda()
                        bert_inputs[1] = bert_inputs[1].cuda()
                        bert_inputs[2] = bert_inputs[2].cuda()

                output = network.forward(word, char, pos, input_bert=bert_inputs,
                                         mask=masks, length=lengths, hx=None)
                output = output['output'].detach()

                if args.train_level == 'word':
                    output = classifier(output)
                    output = output.contiguous().view(-1, output.size(2))
                else:
                    output = torch.mean(output, dim=1)
                    output = classifier(output)

                preds = output.max(1)[1].cpu()
                labels = torch.LongTensor([lang_idx])
                labels = labels.expand(*preds.size())
                n_correct = preds.eq(labels).sum().item()
                total_corr += n_correct
                total += output.size(0)

            return {'total_corr': total_corr, 'total': total}

    if args.test_lang:
        classifier.load_state_dict(torch.load(model_name))
        path = args.data_dir + args.test_lang + '_train.conllu'
        test_data = _read_one(path)

        # TODO: fixed indexing is not GOOD
        lang_idx = 0 if args.test_lang == args.src_lang else 1
        result = compute_accuracy(test_data, lang_idx)
        accuracy = (result['total_corr'] * 100.0) / result['total']
        logger.info('[Classifier performance] Language: %s || accuracy: %.2f%%' % (args.test_lang, accuracy))

    else:
        # if output directory doesn't exist, create it
        if not os.path.exists(args.model_path):
            os.makedirs(args.model_path)

        # --------------------- Loading data -------------------------------------- #
        train_data = dict()
        dev_data = dict()
        test_data = dict()
        num_data = dict()
        lang_ids = dict()
        reverse_lang_ids = dict()

        # loading language data
        for language in args.langs:
            lang_ids[language] = len(lang_ids)
            reverse_lang_ids[lang_ids[language]] = language

            train_path = args.data_dir + language + '_train.conllu'
            # Utilize at most 10000 examples
            tmp_data = _read_one(train_path, max_size=10000)
            num_data[language] = sum(tmp_data[1])
            train_data[language] = tmp_data

            dev_path = args.data_dir + language + '_dev.conllu'
            tmp_data = _read_one(dev_path)
            dev_data[language] = tmp_data

            test_path = args.data_dir + language + '_test.conllu'
            tmp_data = _read_one(test_path)
            test_data[language] = tmp_data

        # ------------------------------------------------------------------------- #

        optim = torch.optim.Adam(classifier.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss()

        def compute_loss(lang_name, lang_idx):
            word, char, pos, _, _, masks, lengths, bert_inputs = conllx_data.get_batch_variable(train_data[lang_name],
                                                                                                batch_size,
                                                                                                unk_replace=0.5)

            if use_gpu:
                word = word.cuda()
                char = char.cuda()
                pos = pos.cuda()
                masks = masks.cuda()
                lengths = lengths.cuda()
                if bert_inputs[0] is not None:
                    bert_inputs[0] = bert_inputs[0].cuda()
                    bert_inputs[1] = bert_inputs[1].cuda()
                    bert_inputs[2] = bert_inputs[2].cuda()

            output = network.forward(word, char, pos, input_bert=bert_inputs,
                                     mask=masks, length=lengths, hx=None)
            output = output['output'].detach()

            if args.train_level == 'word':
                output = classifier(output)
                output = output.contiguous().view(-1, output.size(2))
            else:
                output = torch.mean(output, dim=1)
                output = classifier(output)

            labels = torch.empty(output.size(0)).fill_(lang_idx).type_as(output).long()
            loss = criterion(output, labels)
            return loss

        # ---------------------- Form the mini-batches -------------------------- #
        num_batches = 0
        batch_lang_labels = []
        for lang in args.langs:
            nbatches = num_data[lang] // batch_size + 1
            batch_lang_labels.extend([lang] * nbatches)
            num_batches += nbatches

        assert len(batch_lang_labels) == num_batches
        # ------------------------------------------------------------------------- #

        best_dev_accuracy = 0
        patience = 0
        for epoch in range(1, args.num_epochs + 1):
            # shuffling the data
            lang_in_batch = copy.copy(batch_lang_labels)
            random.shuffle(lang_in_batch)

            classifier.train()
            for batch in range(1, num_batches + 1):
                lang_name = lang_in_batch[batch - 1]
                lang_id = lang_ids.get(lang_name)

                optim.zero_grad()
                loss = compute_loss(lang_name, lang_id)
                loss.backward()
                optim.step()

            # Validation
            avg_acc = dict()
            for dev_lang in dev_data.keys():
                lang_idx = lang_ids.get(dev_lang)
                result = compute_accuracy(dev_data[dev_lang], lang_idx)
                accuracy = (result['total_corr'] * 100.0) / result['total']
                avg_acc[dev_lang] = accuracy

            acc = ', '.join('%s: %.2f' % (key, val) for (key, val) in avg_acc.items())
            logger.info('Epoch: %d, Performance[%s]' % (epoch, acc))

            avg_acc = sum(avg_acc.values()) / len(avg_acc)
            if best_dev_accuracy < avg_acc:
                best_dev_accuracy = avg_acc
                patience = 0
                state_dict = classifier.state_dict()
                torch.save(state_dict, model_name)
            else:
                patience += 1

            if patience >= 5:
                break
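        # Early stopping: training halts once the average dev accuracy has not
        # improved for 5 consecutive epochs; the best classifier state_dict is the
        # one kept on disk and later evaluated on the test sets.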

        # Testing
        logger.info('Testing model %s' % pre_model_name)
        total_corr, total = 0, 0
        for test_lang in UD_languages:
            if test_lang in test_data:
                lang_idx = lang_ids.get(test_lang)
                result = compute_accuracy(test_data[test_lang], lang_idx)
                accuracy = (result['total_corr'] * 100.0) / result['total']
                print('[LANG]: %s, [ACC]: %.2f' % (test_lang.upper(), accuracy))
                total_corr += result['total_corr']
                total += result['total']
        print('[Avg. Performance]: %.2f' % ((total_corr * 100.0) / total))
Example #12
0
def main():
    args_parser = argparse.ArgumentParser(
        description='Tuning with graph-based parsing')
    args_parser.add_argument('--seed',
                             type=int,
                             default=1234,
                             help='random seed for reproducibility')
    args_parser.add_argument('--mode',
                             choices=['RNN', 'LSTM', 'GRU', 'FastLSTM'],
                             help='architecture of rnn',
                             required=True)
    args_parser.add_argument('--num_epochs',
                             type=int,
                             default=1000,
                             help='Number of training epochs')
    args_parser.add_argument('--batch_size',
                             type=int,
                             default=64,
                             help='Number of sentences in each batch')
    args_parser.add_argument('--hidden_size',
                             type=int,
                             default=256,
                             help='Number of hidden units in RNN')
    args_parser.add_argument('--arc_space',
                             type=int,
                             default=128,
                             help='Dimension of tag space')
    args_parser.add_argument('--type_space',
                             type=int,
                             default=128,
                             help='Dimension of tag space')
    args_parser.add_argument('--num_layers',
                             type=int,
                             default=1,
                             help='Number of layers of encoder.')
    args_parser.add_argument('--num_filters',
                             type=int,
                             default=50,
                             help='Number of filters in CNN')
    args_parser.add_argument('--pos',
                             action='store_true',
                             help='use part-of-speech embedding.')
    args_parser.add_argument('--char',
                             action='store_true',
                             help='use character embedding and CNN.')
    args_parser.add_argument('--pos_dim',
                             type=int,
                             default=50,
                             help='Dimension of POS embeddings')
    args_parser.add_argument('--char_dim',
                             type=int,
                             default=50,
                             help='Dimension of Character embeddings')
    args_parser.add_argument('--opt',
                             choices=['adam', 'sgd', 'adamax'],
                             help='optimization algorithm')
    args_parser.add_argument('--objective',
                             choices=['cross_entropy', 'crf'],
                             default='cross_entropy',
                             help='objective function of training procedure.')
    args_parser.add_argument('--decode',
                             choices=['mst', 'greedy'],
                             default='mst',
                             help='decoding algorithm')
    args_parser.add_argument('--learning_rate',
                             type=float,
                             default=0.01,
                             help='Learning rate')
    # args_parser.add_argument('--decay_rate', type=float, default=0.05, help='Decay rate of learning rate')
    args_parser.add_argument('--clip',
                             type=float,
                             default=5.0,
                             help='gradient clipping')
    args_parser.add_argument('--gamma',
                             type=float,
                             default=0.0,
                             help='weight for regularization')
    args_parser.add_argument('--epsilon',
                             type=float,
                             default=1e-8,
                             help='epsilon for adam or adamax')
    args_parser.add_argument('--p_rnn',
                             nargs='+',
                             type=float,
                             required=True,
                             help='dropout rate for RNN')
    args_parser.add_argument('--p_in',
                             type=float,
                             default=0.33,
                             help='dropout rate for input embeddings')
    args_parser.add_argument('--p_out',
                             type=float,
                             default=0.33,
                             help='dropout rate for output layer')
    # args_parser.add_argument('--schedule', type=int, help='schedule for learning rate decay')
    args_parser.add_argument(
        '--unk_replace',
        type=float,
        default=0.,
        help='The rate to replace a singleton word with UNK')
    args_parser.add_argument('--punctuation',
                             nargs='+',
                             type=str,
                             help='List of punctuations')
    args_parser.add_argument(
        '--word_embedding',
        choices=['word2vec', 'glove', 'senna', 'sskip', 'polyglot'],
        help='Embedding for words',
        required=True)
    args_parser.add_argument('--word_path',
                             help='path for word embedding dict')
    args_parser.add_argument(
        '--freeze',
        action='store_true',
        help='freeze the word embedding (disable fine-tuning).')
    args_parser.add_argument('--char_embedding',
                             choices=['random', 'polyglot'],
                             help='Embedding for characters',
                             required=True)
    args_parser.add_argument('--char_path',
                             help='path for character embedding dict')
    args_parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    args_parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    args_parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"
    args_parser.add_argument('--vocab_path',
                             help='path for prebuilt alphabets.',
                             default=None)
    args_parser.add_argument('--model_path',
                             help='path for saving model file.',
                             required=True)
    args_parser.add_argument('--model_name',
                             help='name for saving model file.',
                             required=True)
    #
    args_parser.add_argument('--no_word',
                             action='store_true',
                             help='do not use word embedding.')
    #
    # lrate schedule with warmup in the first iter.
    args_parser.add_argument('--use_warmup_schedule',
                             action='store_true',
                             help="Use warmup lrate schedule.")
    args_parser.add_argument('--decay_rate',
                             type=float,
                             default=0.75,
                             help='Decay rate of learning rate')
    args_parser.add_argument('--max_decay',
                             type=int,
                             default=9,
                             help='Number of decays before stop')
    args_parser.add_argument('--schedule',
                             type=int,
                             help='schedule for learning rate decay')
    args_parser.add_argument('--double_schedule_decay',
                             type=int,
                             default=5,
                             help='Number of decays to double schedule')
    args_parser.add_argument(
        '--check_dev',
        type=int,
        default=5,
        help='Check development performance in every n\'th iteration')
    # Transformer encoder
    args_parser.add_argument('--no_CoRNN',
                             action='store_true',
                             help='do not use context RNN.')
    args_parser.add_argument(
        '--trans_hid_size',
        type=int,
        default=1024,
        help='#hidden units in point-wise feed-forward in transformer')
    args_parser.add_argument(
        '--d_k',
        type=int,
        default=64,
        help='d_k for multi-head-attention in transformer encoder')
    args_parser.add_argument(
        '--d_v',
        type=int,
        default=64,
        help='d_v for multi-head-attention in transformer encoder')
    args_parser.add_argument('--multi_head_attn',
                             action='store_true',
                             help='use multi-head-attention.')
    args_parser.add_argument('--num_head',
                             type=int,
                             default=8,
                             help='Value of h in multi-head attention')
    # - positional
    args_parser.add_argument(
        '--enc_use_neg_dist',
        action='store_true',
        help="Use negative distance for enc's relational-distance embedding.")
    args_parser.add_argument(
        '--enc_clip_dist',
        type=int,
        default=0,
        help="The clipping distance for relative position features.")
    args_parser.add_argument('--position_dim',
                             type=int,
                             default=50,
                             help='Dimension of Position embeddings.')
    args_parser.add_argument(
        '--position_embed_num',
        type=int,
        default=200,
        help=
        'Minimum value of position embedding num, which usually is max-sent-length.'
    )
    args_parser.add_argument('--train_position',
                             action='store_true',
                             help='train positional encoding for transformer.')
    #
    args_parser.add_argument(
        '--train_len_thresh',
        type=int,
        default=100,
        help='In training, discard sentences longer than this.')

    #
    args = args_parser.parse_args()

    # fix data-prepare seed
    random.seed(1234)
    np.random.seed(1234)
    # model's seed
    torch.manual_seed(args.seed)

    logger = get_logger("GraphParser")

    mode = args.mode
    obj = args.objective
    decoding = args.decode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    model_path = args.model_path
    model_name = args.model_name
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    arc_space = args.arc_space
    type_space = args.type_space
    num_layers = args.num_layers
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    opt = args.opt
    momentum = 0.9
    betas = (0.9, 0.9)
    eps = args.epsilon
    decay_rate = args.decay_rate
    clip = args.clip
    gamma = args.gamma
    schedule = args.schedule
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    unk_replace = args.unk_replace
    punctuation = args.punctuation

    freeze = args.freeze
    word_embedding = args.word_embedding
    word_path = args.word_path

    use_char = args.char
    char_embedding = args.char_embedding
    char_path = args.char_path

    use_pos = args.pos
    pos_dim = args.pos_dim
    word_dict, word_dim = utils.load_embedding_dict(word_embedding, word_path)
    char_dict = None
    char_dim = args.char_dim
    if char_embedding != 'random':
        char_dict, char_dim = utils.load_embedding_dict(
            char_embedding, char_path)

    #
    vocab_path = args.vocab_path if args.vocab_path is not None else args.model_path

    logger.info("Creating Alphabets")
    alphabet_path = os.path.join(vocab_path, 'alphabets/')
    model_name = os.path.join(model_path, model_name)
    # todo(warn): exactly same for loading vocabs
    word_alphabet, char_alphabet, pos_alphabet, type_alphabet, max_sent_length = conllx_data.create_alphabets(
        alphabet_path,
        train_path,
        data_paths=[dev_path, test_path],
        max_vocabulary_size=50000,
        embedd_dict=word_dict)

    max_sent_length = max(max_sent_length, args.position_embed_num)

    num_words = word_alphabet.size()
    num_chars = char_alphabet.size()
    num_pos = pos_alphabet.size()
    num_types = type_alphabet.size()

    logger.info("Word Alphabet Size: %d" % num_words)
    logger.info("Character Alphabet Size: %d" % num_chars)
    logger.info("POS Alphabet Size: %d" % num_pos)
    logger.info("Type Alphabet Size: %d" % num_types)

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    # ===== the reading
    def _read_one(path, is_train):
        lang_id = guess_language_id(path)
        logger.info("Reading: guess that the language of file %s is %s." %
                    (path, lang_id))
        one_data = conllx_data.read_data_to_variable(
            path,
            word_alphabet,
            char_alphabet,
            pos_alphabet,
            type_alphabet,
            use_gpu=use_gpu,
            volatile=(not is_train),
            symbolic_root=True,
            lang_id=lang_id,
            len_thresh=(args.train_len_thresh if is_train else 100000))
        return one_data
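    # _read_one guesses the language from the file path, marks eval data as
    # volatile (this example targets pre-0.4 PyTorch), and drops training
    # sentences longer than --train_len_thresh while leaving dev/test effectively
    # unfiltered (threshold 100000).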

    data_train = _read_one(train_path, True)
    num_data = sum(data_train[1])

    data_dev = _read_one(dev_path, False)
    data_test = _read_one(test_path, False)
    # =====

    punct_set = None
    if punctuation is not None:
        punct_set = set(punctuation)
        logger.info("punctuations(%d): %s" %
                    (len(punct_set), ' '.join(punct_set)))

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / word_dim)
        table = np.empty([word_alphabet.size(), word_dim], dtype=np.float32)
        table[conllx_data.UNK_ID, :] = np.zeros([1, word_dim]).astype(
            np.float32) if freeze else np.random.uniform(
                -scale, scale, [1, word_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in word_dict:
                embedding = word_dict[word]
            elif word.lower() in word_dict:
                embedding = word_dict[word.lower()]
            else:
                embedding = np.zeros([1, word_dim]).astype(
                    np.float32) if freeze else np.random.uniform(
                        -scale, scale, [1, word_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('word OOV: %d' % oov)
        return torch.from_numpy(table)
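    # When --freeze is set, OOV words and the UNK row are left as zero vectors;
    # otherwise they are drawn from U(-sqrt(3/dim), sqrt(3/dim)), which gives the
    # embedding entries a variance of 1/dim (a^2/3 with a = sqrt(3/dim)), the
    # usual scaled-uniform initialization.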

    def construct_char_embedding_table():
        if char_dict is None:
            return None

        scale = np.sqrt(3.0 / char_dim)
        table = np.empty([num_chars, char_dim], dtype=np.float32)
        table[conllx_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, char_dim]).astype(np.float32)
        oov = 0
        for char, index in char_alphabet.items():
            if char in char_dict:
                embedding = char_dict[char]
            else:
                embedding = np.random.uniform(-scale, scale,
                                              [1, char_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('character OOV: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    char_table = construct_char_embedding_table()

    window = 3
    if obj == 'cross_entropy':
        network = BiRecurrentConvBiAffine(
            word_dim,
            num_words,
            char_dim,
            num_chars,
            pos_dim,
            num_pos,
            num_filters,
            window,
            mode,
            hidden_size,
            num_layers,
            num_types,
            arc_space,
            type_space,
            embedd_word=word_table,
            embedd_char=char_table,
            p_in=p_in,
            p_out=p_out,
            p_rnn=p_rnn,
            biaffine=True,
            pos=use_pos,
            char=use_char,
            train_position=args.train_position,
            use_con_rnn=(not args.no_CoRNN),
            trans_hid_size=args.trans_hid_size,
            d_k=args.d_k,
            d_v=args.d_v,
            multi_head_attn=args.multi_head_attn,
            num_head=args.num_head,
            enc_use_neg_dist=args.enc_use_neg_dist,
            enc_clip_dist=args.enc_clip_dist,
            position_dim=args.position_dim,
            max_sent_length=max_sent_length,
            use_gpu=use_gpu,
            no_word=args.no_word)

    elif obj == 'crf':
        raise NotImplementedError
    else:
        raise RuntimeError('Unknown objective: %s' % obj)

    def save_args():
        arg_path = model_name + '.arg.json'
        arguments = [
            word_dim, num_words, char_dim, num_chars, pos_dim, num_pos,
            num_filters, window, mode, hidden_size, num_layers, num_types,
            arc_space, type_space
        ]
        kwargs = {
            'p_in': p_in,
            'p_out': p_out,
            'p_rnn': p_rnn,
            'biaffine': True,
            'pos': use_pos,
            'char': use_char,
            'train_position': args.train_position,
            'use_con_rnn': (not args.no_CoRNN),
            'trans_hid_size': args.trans_hid_size,
            'd_k': args.d_k,
            'd_v': args.d_v,
            'multi_head_attn': args.multi_head_attn,
            'num_head': args.num_head,
            'enc_use_neg_dist': args.enc_use_neg_dist,
            'enc_clip_dist': args.enc_clip_dist,
            'position_dim': args.position_dim,
            'max_sent_length': max_sent_length,
            'no_word': args.no_word
        }
        json.dump({
            'args': arguments,
            'kwargs': kwargs
        },
                  open(arg_path, 'w'),
                  indent=4)

    if freeze:
        network.word_embedd.freeze()

    if use_gpu:
        network.cuda()

    save_args()

    pred_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet,
                               type_alphabet)
    gold_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet,
                               type_alphabet)

    def generate_optimizer(opt, lr, params):
        params = filter(lambda param: param.requires_grad, params)
        if opt == 'adam':
            return Adam(params,
                        lr=lr,
                        betas=betas,
                        weight_decay=gamma,
                        eps=eps)
        elif opt == 'sgd':
            return SGD(params,
                       lr=lr,
                       momentum=momentum,
                       weight_decay=gamma,
                       nesterov=True)
        elif opt == 'adamax':
            return Adamax(params,
                          lr=lr,
                          betas=betas,
                          weight_decay=gamma,
                          eps=eps)
        else:
            raise ValueError('Unknown optimization algorithm: %s' % opt)
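    # generate_optimizer only passes parameters with requires_grad=True (so a
    # frozen word embedding is excluded) and reuses --gamma as the L2 weight-decay
    # coefficient for all three optimizers; it is called again later whenever the
    # learning rate is decayed.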

    lr = learning_rate
    optim = generate_optimizer(opt, lr, network.parameters())
    opt_info = 'opt: %s, ' % opt
    if opt == 'adam':
        opt_info += 'betas=%s, eps=%.1e' % (betas, eps)
    elif opt == 'sgd':
        opt_info += 'momentum=%.2f' % momentum
    elif opt == 'adamax':
        opt_info += 'betas=%s, eps=%.1e' % (betas, eps)

    word_status = 'frozen' if freeze else 'fine tune'
    char_status = 'enabled' if use_char else 'disabled'
    pos_status = 'enabled' if use_pos else 'disabled'
    logger.info(
        "Embedding dim: word=%d (%s), char=%d (%s), pos=%d (%s)" %
        (word_dim, word_status, char_dim, char_status, pos_dim, pos_status))
    logger.info("CNN: filter=%d, kernel=%d" % (num_filters, window))
    logger.info(
        "RNN: %s, num_layer=%d, hidden=%d, arc_space=%d, type_space=%d" %
        (mode, num_layers, hidden_size, arc_space, type_space))
    logger.info(
        "train: obj: %s, l2: %f, (#data: %d, batch: %d, clip: %.2f, unk replace: %.2f)"
        % (obj, gamma, num_data, batch_size, clip, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" %
                (p_in, p_out, p_rnn))
    logger.info("decoding algorithm: %s" % decoding)
    logger.info(opt_info)

    num_batches = num_data // batch_size + 1
    dev_ucorrect = 0.0
    dev_lcorrect = 0.0
    dev_ucomlpete_match = 0.0
    dev_lcomplete_match = 0.0

    dev_ucorrect_nopunc = 0.0
    dev_lcorrect_nopunc = 0.0
    dev_ucomlpete_match_nopunc = 0.0
    dev_lcomplete_match_nopunc = 0.0
    dev_root_correct = 0.0

    best_epoch = 0

    test_ucorrect = 0.0
    test_lcorrect = 0.0
    test_ucomlpete_match = 0.0
    test_lcomplete_match = 0.0

    test_ucorrect_nopunc = 0.0
    test_lcorrect_nopunc = 0.0
    test_ucomlpete_match_nopunc = 0.0
    test_lcomplete_match_nopunc = 0.0
    test_root_correct = 0.0
    test_total = 0
    test_total_nopunc = 0
    test_total_inst = 0
    test_total_root = 0

    if decoding == 'greedy':
        decode = network.decode
    elif decoding == 'mst':
        decode = network.decode_mst
    else:
        raise ValueError('Unknown decoding algorithm: %s' % decoding)

    patient = 0
    decay = 0
    max_decay = args.max_decay
    double_schedule_decay = args.double_schedule_decay

    # lrate schedule
    step_num = 0
    use_warmup_schedule = args.use_warmup_schedule
    warmup_factor = (lr + 0.) / num_batches

    if use_warmup_schedule:
        logger.info("Use warmup lrate for the first epoch, from 0 up to %s." %
                    (lr, ))
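    # With warmup enabled, the per-step learning rate during the first epoch is
    #     cur_lrate = lr * step_num / num_batches,
    # i.e. it climbs linearly from ~0 to the full --learning_rate over one epoch.
    # For illustration only (assumed numbers): lr = 0.01 with 500 batches per
    # epoch gives 0.005 at step 250 and 0.01 at step 500; after epoch 1 the rate
    # stays at lr until the decay logic below reduces it.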
    #

    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s, optim: %s, learning rate=%.6f, eps=%.1e, decay rate=%.2f (schedule=%d, patient=%d, decay=%d)): '
            %
            (epoch, mode, opt, lr, eps, decay_rate, schedule, patient, decay))
        train_err = 0.
        train_err_arc = 0.
        train_err_type = 0.
        train_total = 0.
        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            # lrate schedule (before each step)
            step_num += 1
            if use_warmup_schedule and epoch <= 1:
                cur_lrate = warmup_factor * step_num
                # set lr
                for param_group in optim.param_groups:
                    param_group['lr'] = cur_lrate
            #
            word, char, pos, heads, types, masks, lengths = conllx_data.get_batch_variable(
                data_train, batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss_arc, loss_type = network.loss(word,
                                               char,
                                               pos,
                                               heads,
                                               types,
                                               mask=masks,
                                               length=lengths)
            loss = loss_arc + loss_type
            loss.backward()
            clip_grad_norm(network.parameters(), clip)
            optim.step()

            num_inst = word.size(
                0) if obj == 'crf' else masks.data.sum() - word.size(0)
            train_err += loss.data[0] * num_inst
            train_err_arc += loss_arc.data[0] * num_inst
            train_err_type += loss_type.data[0] * num_inst
            train_total += num_inst
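            # clip_grad_norm rescales all gradients so their global L2 norm is at
            # most --clip before the optimizer step. num_inst counts sentences
            # under the CRF objective and real tokens otherwise (mask sum minus
            # one symbolic root per sentence); the .data[0] scalar access is
            # pre-0.4 PyTorch style, consistent with the rest of this example.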

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 10 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, arc: %.4f, type: %.4f, time left: %.2fs' % (
                    batch, num_batches, train_err / train_total, train_err_arc
                    / train_total, train_err_type / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print(
            'train: %d loss: %.4f, arc: %.4f, type: %.4f, time: %.2fs' %
            (num_batches, train_err / train_total, train_err_arc / train_total,
             train_err_type / train_total, time.time() - start_time))

        ################################################################################################
        if epoch % args.check_dev != 0:
            continue

        # evaluate performance on dev data
        network.eval()
        pred_filename = 'tmp/%spred_dev%d' % (str(uid), epoch)
        pred_writer.start(pred_filename)
        gold_filename = 'tmp/%sgold_dev%d' % (str(uid), epoch)
        gold_writer.start(gold_filename)

        dev_ucorr = 0.0
        dev_lcorr = 0.0
        dev_total = 0
        dev_ucomlpete = 0.0
        dev_lcomplete = 0.0
        dev_ucorr_nopunc = 0.0
        dev_lcorr_nopunc = 0.0
        dev_total_nopunc = 0
        dev_ucomlpete_nopunc = 0.0
        dev_lcomplete_nopunc = 0.0
        dev_root_corr = 0.0
        dev_total_root = 0.0
        dev_total_inst = 0.0
        for batch in conllx_data.iterate_batch_variable(data_dev, batch_size):
            word, char, pos, heads, types, masks, lengths = batch
            heads_pred, types_pred = decode(
                word,
                char,
                pos,
                mask=masks,
                length=lengths,
                leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
            word = word.data.cpu().numpy()
            pos = pos.data.cpu().numpy()
            lengths = lengths.cpu().numpy()
            heads = heads.data.cpu().numpy()
            types = types.data.cpu().numpy()

            pred_writer.write(word,
                              pos,
                              heads_pred,
                              types_pred,
                              lengths,
                              symbolic_root=True)
            gold_writer.write(word,
                              pos,
                              heads,
                              types,
                              lengths,
                              symbolic_root=True)

            stats, stats_nopunc, stats_root, num_inst = parser.eval(
                word,
                pos,
                heads_pred,
                types_pred,
                heads,
                types,
                word_alphabet,
                pos_alphabet,
                lengths,
                punct_set=punct_set,
                symbolic_root=True)
            ucorr, lcorr, total, ucm, lcm = stats
            ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
            corr_root, total_root = stats_root
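            # ucorr/lcorr count tokens whose predicted head (UAS) and head+label
            # (LAS) are correct; ucm/lcm count sentences parsed completely
            # correctly. The *_nopunc variants exclude tokens in punct_set, and
            # stats_root tracks how often the root word is attached correctly.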

            dev_ucorr += ucorr
            dev_lcorr += lcorr
            dev_total += total
            dev_ucomlpete += ucm
            dev_lcomplete += lcm

            dev_ucorr_nopunc += ucorr_nopunc
            dev_lcorr_nopunc += lcorr_nopunc
            dev_total_nopunc += total_nopunc
            dev_ucomlpete_nopunc += ucm_nopunc
            dev_lcomplete_nopunc += lcm_nopunc

            dev_root_corr += corr_root
            dev_total_root += total_root

            dev_total_inst += num_inst

        pred_writer.close()
        gold_writer.close()
        print(
            'W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%'
            % (dev_ucorr, dev_lcorr, dev_total, dev_ucorr * 100 / dev_total,
               dev_lcorr * 100 / dev_total, dev_ucomlpete * 100 /
               dev_total_inst, dev_lcomplete * 100 / dev_total_inst))
        print(
            'Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%'
            % (dev_ucorr_nopunc, dev_lcorr_nopunc, dev_total_nopunc,
               dev_ucorr_nopunc * 100 / dev_total_nopunc, dev_lcorr_nopunc *
               100 / dev_total_nopunc, dev_ucomlpete_nopunc * 100 /
               dev_total_inst, dev_lcomplete_nopunc * 100 / dev_total_inst))
        print('Root: corr: %d, total: %d, acc: %.2f%%' %
              (dev_root_corr, dev_total_root,
               dev_root_corr * 100 / dev_total_root))

        if dev_lcorrect_nopunc < dev_lcorr_nopunc or (
                dev_lcorrect_nopunc == dev_lcorr_nopunc
                and dev_ucorrect_nopunc < dev_ucorr_nopunc):
            dev_ucorrect_nopunc = dev_ucorr_nopunc
            dev_lcorrect_nopunc = dev_lcorr_nopunc
            dev_ucomlpete_match_nopunc = dev_ucomlpete_nopunc
            dev_lcomplete_match_nopunc = dev_lcomplete_nopunc

            dev_ucorrect = dev_ucorr
            dev_lcorrect = dev_lcorr
            dev_ucomlpete_match = dev_ucomlpete
            dev_lcomplete_match = dev_lcomplete

            dev_root_correct = dev_root_corr

            best_epoch = epoch
            patient = 0
            # torch.save(network, model_name)
            torch.save(network.state_dict(), model_name)

            pred_filename = 'tmp/%spred_test%d' % (str(uid), epoch)
            pred_writer.start(pred_filename)
            gold_filename = 'tmp/%sgold_test%d' % (str(uid), epoch)
            gold_writer.start(gold_filename)

            test_ucorrect = 0.0
            test_lcorrect = 0.0
            test_ucomlpete_match = 0.0
            test_lcomplete_match = 0.0
            test_total = 0

            test_ucorrect_nopunc = 0.0
            test_lcorrect_nopunc = 0.0
            test_ucomlpete_match_nopunc = 0.0
            test_lcomplete_match_nopunc = 0.0
            test_total_nopunc = 0
            test_total_inst = 0

            test_root_correct = 0.0
            test_total_root = 0
            for batch in conllx_data.iterate_batch_variable(
                    data_test, batch_size):
                word, char, pos, heads, types, masks, lengths = batch
                heads_pred, types_pred = decode(
                    word,
                    char,
                    pos,
                    mask=masks,
                    length=lengths,
                    leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
                word = word.data.cpu().numpy()
                pos = pos.data.cpu().numpy()
                lengths = lengths.cpu().numpy()
                heads = heads.data.cpu().numpy()
                types = types.data.cpu().numpy()

                pred_writer.write(word,
                                  pos,
                                  heads_pred,
                                  types_pred,
                                  lengths,
                                  symbolic_root=True)
                gold_writer.write(word,
                                  pos,
                                  heads,
                                  types,
                                  lengths,
                                  symbolic_root=True)

                stats, stats_nopunc, stats_root, num_inst = parser.eval(
                    word,
                    pos,
                    heads_pred,
                    types_pred,
                    heads,
                    types,
                    word_alphabet,
                    pos_alphabet,
                    lengths,
                    punct_set=punct_set,
                    symbolic_root=True)
                ucorr, lcorr, total, ucm, lcm = stats
                ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
                corr_root, total_root = stats_root

                test_ucorrect += ucorr
                test_lcorrect += lcorr
                test_total += total
                test_ucomlpete_match += ucm
                test_lcomplete_match += lcm

                test_ucorrect_nopunc += ucorr_nopunc
                test_lcorrect_nopunc += lcorr_nopunc
                test_total_nopunc += total_nopunc
                test_ucomlpete_match_nopunc += ucm_nopunc
                test_lcomplete_match_nopunc += lcm_nopunc

                test_root_correct += corr_root
                test_total_root += total_root

                test_total_inst += num_inst

            pred_writer.close()
            gold_writer.close()
        else:
            if dev_ucorr_nopunc * 100 / dev_total_nopunc < dev_ucorrect_nopunc * 100 / dev_total_nopunc - 5 or patient >= schedule:
                # network = torch.load(model_name)
                network.load_state_dict(torch.load(model_name))
                lr = lr * decay_rate
                optim = generate_optimizer(opt, lr, network.parameters())

                if decoding == 'greedy':
                    decode = network.decode
                elif decoding == 'mst':
                    decode = network.decode_mst
                else:
                    raise ValueError('Unknown decoding algorithm: %s' %
                                     decoding)

                patient = 0
                decay += 1
                if decay % double_schedule_decay == 0:
                    schedule *= 2
            else:
                patient += 1
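        # Learning-rate schedule: when the current dev UAS (w/o punctuation) drops
        # more than 5 points below the best, or patience exceeds --schedule, the
        # best checkpoint is reloaded, lr is multiplied by --decay_rate and a
        # fresh optimizer is built; every --double_schedule_decay decays the
        # patience window doubles, and training stops after --max_decay decays.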

        print(
            '----------------------------------------------------------------------------------------------------------------------------'
        )
        print(
            'best dev  W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
            % (dev_ucorrect, dev_lcorrect, dev_total,
               dev_ucorrect * 100 / dev_total, dev_lcorrect * 100 / dev_total,
               dev_ucomlpete_match * 100 / dev_total_inst,
               dev_lcomplete_match * 100 / dev_total_inst, best_epoch))
        print(
            'best dev  Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
            % (dev_ucorrect_nopunc, dev_lcorrect_nopunc, dev_total_nopunc,
               dev_ucorrect_nopunc * 100 / dev_total_nopunc,
               dev_lcorrect_nopunc * 100 / dev_total_nopunc,
               dev_ucomlpete_match_nopunc * 100 / dev_total_inst,
               dev_lcomplete_match_nopunc * 100 / dev_total_inst, best_epoch))
        print('best dev  Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)' %
              (dev_root_correct, dev_total_root,
               dev_root_correct * 100 / dev_total_root, best_epoch))
        print(
            '----------------------------------------------------------------------------------------------------------------------------'
        )
        print(
            'best test W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
            % (test_ucorrect, test_lcorrect, test_total, test_ucorrect * 100 /
               test_total, test_lcorrect * 100 / test_total,
               test_ucomlpete_match * 100 / test_total_inst,
               test_lcomplete_match * 100 / test_total_inst, best_epoch))
        print(
            'best test Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)'
            %
            (test_ucorrect_nopunc, test_lcorrect_nopunc, test_total_nopunc,
             test_ucorrect_nopunc * 100 / test_total_nopunc,
             test_lcorrect_nopunc * 100 / test_total_nopunc,
             test_ucomlpete_match_nopunc * 100 / test_total_inst,
             test_lcomplete_match_nopunc * 100 / test_total_inst, best_epoch))
        print('best test Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)' %
              (test_root_correct, test_total_root,
               test_root_correct * 100 / test_total_root, best_epoch))
        print(
            '============================================================================================================================'
        )

        if decay == max_decay:
            break