예제 #1
0
def evaluate_tagger(model, batched_dev_data, dev_dep_trees, dev_elmo_hdf5,
                    evaluator, writer, global_step):
    predictions = []
    dev_loss = 0
    total_correct, total_prop = 0, 0

    model.eval()
    for i, batched_tensor in enumerate(batched_dev_data):
        x, y, lengths, weights = batched_tensor
        word_inputs_seqs, predicate_inputs_seqs, syn_label_ids, pes, sentences_ids, answers, input_lengths, masks, padding_answers = \
            batch_data_variable(dev_dep_trees, x, y, lengths, weights)
        elmo_representations = dev_elmo_hdf5.forward(
            sentences_ids,
            word_inputs_seqs.size()[-1], [len(ans) for ans in answers])

        if args.gpu:
            word_inputs_seqs, predicate_inputs_seqs, syn_label_ids, input_lengths, masks, padding_answers = \
                word_inputs_seqs.cuda(), predicate_inputs_seqs.cuda(), syn_label_ids.cuda(), input_lengths.cuda(), masks.cuda(), padding_answers.cuda()
            elmo_representations = elmo_representations.cuda()

        output = model.forward(word_inputs_seqs, predicate_inputs_seqs,
                               syn_label_ids, pes, elmo_representations,
                               input_lengths)
        loss = model.compute_loss(output, padding_answers, masks)

        dev_loss += float(loss.data)  # accumulate the dev loss
        output = torch.cat([output[i][:actual_length].view(actual_length, -1) for i, actual_length in \
                            enumerate(input_lengths)], dim=0)
        if args.gpu:  # convert Variable to numpy
            p = output.data.cpu().numpy()
        else:
            p = output.data.numpy()
        p = numpy.argmax(p, axis=1)
        batch_tokens_size = sum(lengths)
        assert p.shape[0] == batch_tokens_size
        np_answer = numpy.concatenate(answers)
        correct = numpy.equal(p, np_answer).sum()
        denominator = batch_tokens_size  # prop numpy.dot(answer, numpy.ones(answer.shape[0]))
        total_correct += int(correct)
        total_prop += int(denominator)

        # split the output of the Model according the order
        last_index, batch_p = 0, []
        for length in input_lengths:
            batch_p.append(p[last_index:last_index + length])
            last_index += length
        predictions.extend(batch_p)

    print('Dev loss={:.6f}'.format(dev_loss))
    evaluator.evaluate(predictions)
    print total_correct, " / ", total_prop, " = ", 100.0 * total_correct / total_prop
    if evaluator.accuracy > evaluator.best_accuracy:
        evaluator.best_accuracy = evaluator.accuracy
    writer.write('{}\t{}\t{:.6f}\t{:.2f}\t{:.2f}\n'.format(
        global_step, time.strftime("%Y-%m-%d %H:%M:%S"), float(dev_loss),
        float(evaluator.accuracy), float(evaluator.best_accuracy)))
    writer.flush()
    if evaluator.has_best:
        model.save(os.path.join(args.model, 'model'))
예제 #2
0
def get_scores(config, task, model_path, word_dict_path, label_dict_path, syntactic_dict_path, input_path):
    with Timer('Data loading'):
        print ('Task: {}'.format(task))
        allow_new_words = True
        print ('Allow new words in test data: {}'.format(allow_new_words))

        # Load word and tag dictionary
        word_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN)  # word tokens to Dict
        label_dict, syntactic_dict = Dictionary(), Dictionary()
        word_dict.load(word_dict_path)
        label_dict.load(label_dict_path)
        syntactic_dict.load(syntactic_dict_path)
        data = TaggerData(config, [], [], word_dict, label_dict, None, None)
        data.syntactic_dict = syntactic_dict

        # Load test data.
        if task == 'srl':
            test_sentences, emb_inits, emb_shapes = reader.get_srl_test_data(
                input_path,
                config,
                data.word_dict,
                data.label_dict,
                allow_new_words)
        else:
            test_sentences, emb_inits, emb_shapes = reader.get_postag_test_data(
                input_path,
                config,
                data.word_dict,
                data.label_dict,
                allow_new_words)

        print ('Read {} sentences.'.format(len(test_sentences)))

        # Add pre-trained embeddings for new words in the test data.
        # if allow_new_words:
        data.embedding_shapes = emb_shapes
        data.embeddings = emb_inits
        # Batching.
        test_data = data.get_test_data(test_sentences, batch_size=config.dev_batch_size)

    with Timer('Syntactic Information Extracting'):  # extract the syntactic information from file
        test_dep_trees = SyntacticCONLL()
        test_dep_trees.read_from_file(args.input_dep_trees)
        # generate the syntactic label dict in training corpus
        data.syntactic_dict.accept_new = False
        test_dep_trees.get_syntactic_label_dict(data.syntactic_dict)

    with Timer('Model building and loading'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        model.load(model_path)
        for param in model.parameters():
            print param.size()
        if args.gpu:
            print("Initialize the model with GPU!")
            model = model.cuda()

    with Timer('Running model'):
        scores = []
        model.eval()
        for i, batched_tensor in enumerate(test_data):
            x, y, lengths, weights = batched_tensor
            word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, pes, answers, input_lengths, masks, padding_answers = \
                batch_data_variable(test_dep_trees, None, x, y, lengths, weights)

            if args.gpu:
                word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, input_lengths, masks, \
                padding_answers = \
                    word_inputs_seqs.cuda(), predicate_inputs_seqs.cuda(), syn_label_inputs_seqs.cuda(), \
                    input_lengths.cuda(), masks.cuda(), padding_answers.cuda()

            sc = model.forward(word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, pes, input_lengths)
            sc = sc.data.cpu().numpy() if args.gpu else sc.data.numpy()
            sc = [sc[j] for j in range(sc.shape[0])]
            scores.extend(sc)

    return scores, data, test_sentences, test_data
예제 #3
0
def get_scores(config, task, model_path, word_dict_path, label_dict_path,
               tpf_dict_path, input_path):
    with Timer('Data loading'):
        print('Task: {}'.format(task))
        allow_new_words = True
        print('Allow new words in test data: {}'.format(allow_new_words))

        # Load word and tag dictionary
        word_dict = Dictionary(
            padding_token=PADDING_TOKEN,
            unknown_token=UNKNOWN_TOKEN)  # word tokens to Dict
        label_dict = Dictionary()
        tpf_dict = Dictionary()
        word_dict.load(word_dict_path)
        label_dict.load(label_dict_path)
        tpf_dict.load(tpf_dict_path)
        data = TaggerData(config, [], [], word_dict, label_dict, None, None)
        data.tpf2_dict = tpf_dict

        # Load test data.
        if task == 'srl':
            test_sentences, emb_inits, emb_shapes = reader.get_srl_test_data(
                input_path, config, data.word_dict, data.label_dict,
                allow_new_words)
        else:
            test_sentences, emb_inits, emb_shapes = reader.get_postag_test_data(
                input_path, config, data.word_dict, data.label_dict,
                allow_new_words)

        print('Read {} sentences.'.format(len(test_sentences)))

        # Add pre-trained embeddings for new words in the test data.
        # if allow_new_words:
        data.embedding_shapes = emb_shapes
        data.embeddings = emb_inits
        # Batching.
        test_data = data.get_test_data(test_sentences,
                                       batch_size=config.dev_batch_size)

    with Timer("Get test sentences dict"):
        test_sentences_w_id = []
        for sen in get_srl_sentences(args.input):
            test_sentences_w_id.append(' '.join(sen[1]))
        test_sentences_ids = [int(sen[0][0]) for sen in test_sentences]
        temp = {}
        assert len(test_sentences_w_id) == len(test_sentences_ids)
        for idx, sen in zip(test_sentences_ids, test_sentences_w_id):
            temp[idx] = sen
        test_sentences_w_id = temp

    with Timer("Loading ELMO"):
        test_elmo_hdf5 = hdf5_reader()
        test_elmo_hdf5.read_from_file(args.input_elmo, test_sentences_w_id)

    with Timer('Syntactic Information Extracting'
               ):  # extract the syntactic information from file
        test_dep_trees = SyntacticCONLL()
        test_dep_trees.read_from_file(args.input_dep_trees)

    with Timer("TPF2 generating..."):
        # generate the tree-based position features according the Dependency Tree.
        data.tpf2_dict.accept_new = False
        test_tpf2 = test_dep_trees.get_tpf2_dict(data.test_tensors,
                                                 data.tpf2_dict)
        print("Extract {} test TPF2 features".format(len(test_tpf2)))
        assert len(test_tpf2) == len(data.test_tensors)

    with Timer('Model building and loading'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        model.load(model_path)
        for param in model.parameters():
            print(param.size())
        if args.gpu:
            print("Initialize the model with GPU!")
            model = model.cuda()

    with Timer('Running model'):
        scores = []
        model.eval()
        for i, batched_tensor in enumerate(test_data):
            x, y, lengths, weights = batched_tensor
            word_inputs_seqs, predicate_inputs_seqs, tpf_ids, sentences_ids, answers, input_lengths, masks, padding_answers = \
                batch_data_variable(test_tpf2, x, y, lengths, weights)
            elmo_representations = test_elmo_hdf5.forward(
                sentences_ids,
                word_inputs_seqs.size()[-1], [len(ans) for ans in answers])
            if args.gpu:
                word_inputs_seqs, predicate_inputs_seqs, tpf_ids, input_lengths, masks, padding_answers = \
                    word_inputs_seqs.cuda(), predicate_inputs_seqs.cuda(), tpf_ids.cuda(), input_lengths.cuda(), masks.cuda(), padding_answers.cuda()
                elmo_representations = elmo_representations.cuda()

            sc = model.forward(word_inputs_seqs, predicate_inputs_seqs,
                               tpf_ids, elmo_representations, input_lengths)
            sc = sc.data.cpu().numpy() if args.gpu else sc.data.numpy()
            sc = [sc[j] for j in range(sc.shape[0])]
            scores.extend(sc)

    return scores, data, test_sentences, test_data
예제 #4
0
def train_tagger(args):
    config = configuration.get_config(args.config)
    numpy.random.seed(666)
    torch.manual_seed(666)
    torch.set_printoptions(precision=20)
    ### gpu
    gpu = torch.cuda.is_available()
    if args.gpu and gpu:
        print("GPU available: {}\t GPU ID: {}".format(gpu, args.gpu))
        torch.cuda.manual_seed(666)
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    i = 0
    global_step = 0
    epoch = 0
    train_loss = 0.0

    with Timer('Data loading'):
        vocab_path = args.vocab if args.vocab != '' else None
        label_path = args.labels if args.labels != '' else None
        gold_props_path = args.gold if args.gold != '' else None

        print('Task: {}'.format(args.task))
        if args.task == 'srl':
            # Data and evaluator for SRL.
            data = TaggerData(
                config,
                *reader.get_srl_data(config, args.train, args.dev, vocab_path,
                                     label_path))
            evaluator = SRLEvaluator(data.get_development_data(),
                                     data.label_dict,
                                     gold_props_file=gold_props_path,
                                     use_se_marker=config.use_se_marker,
                                     pred_props_file=None,
                                     word_dict=data.word_dict)
        else:
            print "Not implemented yet!"
            exit()
            # Data and evaluator for PropId.
            data = TaggerData(
                config,
                *reader.get_postag_data(config, args.train, args.dev,
                                        vocab_path, label_path))
            evaluator = PropIdEvaluator(data.get_development_data(),
                                        data.label_dict)

        batched_dev_data = data.get_development_data(
            batch_size=config.dev_batch_size)
        print('Dev data has {} batches.'.format(len(batched_dev_data)))

    with Timer('Syntactic Information Extracting'
               ):  # extract the syntactic information from file
        train_dep_trees = SyntacticCONLL()
        dev_dep_trees = SyntacticCONLL()
        train_dep_trees.read_from_file(args.train_dep_trees)
        dev_dep_trees.read_from_file(args.dev_dep_trees)
        # generate the syntactic label dict in training corpus
        data.syntactic_dict = train_dep_trees.get_syntactic_label_dict()
        data.syntactic_dict.accept_new = False
        dev_dep_trees.get_syntactic_label_dict(data.syntactic_dict)

    with Timer('Preparation'):
        if not os.path.isdir(args.model):
            print('Directory {} does not exist. Creating new.'.format(
                args.model))
            os.makedirs(args.model)
        else:
            if len(os.listdir(args.model)) > 0:
                print ('[WARNING] Log directory {} is not empty, previous checkpoints might be overwritten' \
                    .format(args.model))
        shutil.copyfile(args.config, os.path.join(args.model, 'config'))
        # Save word and label dict to model directory.
        data.word_dict.save(os.path.join(args.model, 'word_dict'))
        data.label_dict.save(os.path.join(args.model, 'label_dict'))
        data.syntactic_dict.save(os.path.join(args.model, 'syn_label_dict'))
        writer = open(os.path.join(args.model, 'checkpoints.tsv'), 'w')
        writer.write(
            'step\tdatetime\tdev_loss\tdev_accuracy\tbest_dev_accuracy\n')

    with Timer('Building model'):
        model = BiLSTMTaggerModel(data, config=config, gpu_id=args.gpu)
        if args.gpu:
            print "BiLSTMTaggerModel initialize with Cuda!"
            model = model.cuda()
            if args.gpu != "" and not torch.cuda.is_available():
                raise Exception("No GPU Found!")
                exit()
        for param in model.parameters():
            print param.size()

    optimizer = torch.optim.Adadelta(
        model.parameters(), lr=1.0,
        rho=0.95)  # initialize the optimizer outside the epoch
    batch_position_encoding = position_encoding_init(
        200, 100)  # 0: root, 1, ...n, n+1: padding | max length 200
    while epoch < config.max_epochs:
        with Timer("Epoch%d" % epoch) as timer:
            model.train()
            train_data = data.get_training_data(include_last_batch=True)
            for batched_tensor in train_data:  # for each batch in the training corpus
                x, y, lengths, weights = batched_tensor
                word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, pes, _, input_lengths, masks, padding_answers = \
                    batch_data_variable(train_dep_trees, batch_position_encoding, x, y, lengths, weights)

                if args.gpu:
                    word_inputs_seqs, predicate_inputs_seqs, syn_label_inputs_seqs, input_lengths, masks, \
                        padding_answers = \
                        word_inputs_seqs.cuda(), predicate_inputs_seqs.cuda(), syn_label_inputs_seqs.cuda(), \
                        input_lengths.cuda(), masks.cuda(), padding_answers.cuda()

                optimizer.zero_grad()
                output = model.forward(word_inputs_seqs, predicate_inputs_seqs,
                                       syn_label_inputs_seqs, pes,
                                       input_lengths)
                loss = model.compute_loss(output, padding_answers, masks)
                loss.backward()
                # gradient clipping
                torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
                optimizer.step()

                train_loss += loss.data  # should be tensor not Variable, avoiding the graph accumulates

                i += 1
                global_step += 1
                if i % 400 == 0:
                    timer.tick("{} training steps, loss={:.3f}".format(
                        i, float(train_loss / i)))

            train_loss = train_loss / i
            print("Epoch {}, steps={}, loss={:.3f}".format(
                epoch, i, float(train_loss)))
            i = 0
            epoch += 1
            train_loss = 0.0
            if epoch % config.checkpoint_every_x_epochs == 0:
                with Timer('Evaluation'):
                    evaluate_tagger(model, batch_position_encoding,
                                    batched_dev_data, dev_dep_trees, evaluator,
                                    writer, global_step)

    # Done. :)
    writer.close()