Example #1
def test(fm, args):
    test_trees = PhraseTree.load_trees(args.test)
    print('Loaded test trees from {}'.format(args.test))
    network = torch.load(args.model)
    print('Loaded model from: {}'.format(args.model))
    accuracy = Parser.evaluate_corpus(test_trees, fm, network)
    print('Accuracy: {}'.format(accuracy))
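Since test() only reads args.test and args.model, it can be driven without a full CLI. A minimal sketch (the paths are placeholders, not from the source):

from types import SimpleNamespace

# Hypothetical paths; substitute a real test file and a saved model.
args = SimpleNamespace(test='testdata/toy.clean', model='toy.model')
test(fm, args)  # fm is a FeatureMapper, e.g. from FeatureMapper.load_json(...)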
Example #2
    def vocab_init(fname, verbose=True):
        """
        Learn vocabulary from file of strings.
        """
        word_freq = defaultdict(int)
        tag_freq = defaultdict(int)
        label_freq = defaultdict(int)

        trees = PhraseTree.load_trees(fname)

        for i, tree in enumerate(trees):
            for (word, tag) in tree.sentence:
                word_freq[word] += 1
                tag_freq[tag] += 1

            for action in Parser.gold_actions(tree):
                if action.startswith('label-'):
                    label = action[6:]  # strip the 'label-' prefix
                    label_freq[label] += 1

            if verbose:
                print('\rTree {}'.format(i), end='')
                sys.stdout.flush()

        if verbose:
            print('\r', end='')

        words = [
            FeatureMapper.UNK,    # index 0
            FeatureMapper.START,  # index 1
            FeatureMapper.STOP,   # index 2
        ] + sorted(word_freq)
        wdict = OrderedDict((w, i) for (i, w) in enumerate(words))

        tags = [
            FeatureMapper.UNK,
            FeatureMapper.START,
            FeatureMapper.STOP,
        ] + sorted(tag_freq)
        tdict = OrderedDict((t, i) for (i, t) in enumerate(tags))

        labels = sorted(label_freq)
        ldict = OrderedDict((l, i) for (i, l) in enumerate(labels))

        if verbose:
            print('Loaded features from {}'.format(fname))
            print('({} words, {} tags, {} nonterminal-chains)'.format(
                len(wdict),
                len(tdict),
                len(ldict),
            ))

        return {
            'wdict': wdict,
            'word_freq': word_freq,
            'tdict': tdict,
            'ldict': ldict,
        }
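A usage sketch, assuming vocab_init is exposed as a static method on FeatureMapper (the file path is a placeholder):

vocab = FeatureMapper.vocab_init('testdata/toy.clean', verbose=False)
wdict = vocab['wdict']          # word -> index, with UNK/START/STOP first
tdict = vocab['tdict']          # tag -> index, same three specials first
ldict = vocab['ldict']          # nonterminal-chain label -> index
word_freq = vocab['word_freq']  # raw counts, later used for random UNKing
print(len(wdict), len(tdict), len(ldict))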
Example #3
    def gold_data_from_file(self, fname):
        """
        Static oracle for file.
        """
        trees = PhraseTree.load_trees(fname)
        return [self.gold_data(tree) for tree in trees]
Example #4
    def write_raw_predicted(fname, sentences, fm, network):
        with open(fname, 'w') as f:
            for sentence in sentences:
                predicted = Parser.parse(sentence, fm, network)
                topped = PhraseTree(
                    symbol='TOP',
                    children=[predicted],
                    sentence=predicted.sentence,
                )
                f.write(str(topped))
                f.write('\n')
Example #5
    def write_predicted(fname, trees, fm, network):
        """
        Input trees are used only to carry sentences.
        """
        accuracy = FScore()
        with open(fname, 'w') as f:
            for tree in trees:
                predicted = Parser.parse(tree.sentence, fm, network)
                local_accuracy = predicted.compare(tree)
                accuracy += local_accuracy
                topped = PhraseTree(
                    symbol='TOP',
                    children=[predicted],
                    sentence=predicted.sentence,
                )
                f.write(str(topped))
                f.write('\n')
        return accuracy
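Note that write_predicted doubles as an evaluator: each predicted tree is written under a TOP root while an FScore accumulates against the gold trees. A usage sketch, assuming fm and network as in the other examples (the paths are placeholders):

gold_trees = PhraseTree.load_trees('testdata/toy.clean')
acc = write_predicted('predicted.txt', gold_trees, fm, network)
print('F-score vs. gold: {}'.format(acc))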
Example #6
from core_nlp.data.phrase_tree import PhraseTree
from core_nlp.models.parser.features import FeatureMapper

fm = FeatureMapper.load_json(
    '/Users/qiwang/python-space/nju_nlp_tools/testdata/toy.vocab.json')
test_trees = PhraseTree.load_trees(
    '/Users/qiwang/python-space/nju_nlp_tools/testdata/toy.clean')
# test_trees[0].rotate_tree()
test_trees[0].draw_tree('tree.png')
Example #7
def train(fm, args):
    train_data_file = args.train
    dev_data_file = args.dev
    epochs = args.epochs
    batch_size = args.batch_size
    unk_param = args.unk_param
    alpha = args.alpha
    beta = args.beta
    model_save_file = args.model

    print("this is train mode")
    start_time = time.time()

    network = Network(fm, args)

    optimizer = optimize.Adadelta(network.parameters(), eps=1e-7, rho=0.99)
    if GlobalNames.use_gpu:
        network.cuda()

    training_data = fm.gold_data_from_file(train_data_file)
    num_batches = -(-len(training_data) // batch_size)
    print('Loaded {} training sentences ({} batches of size {})!'.format(
        len(training_data),
        num_batches,
        batch_size,
    ))
    parse_every = -(-num_batches // 4)  # evaluate on dev ~4 times per epoch

    dev_trees = PhraseTree.load_trees(dev_data_file)
    print('Loaded {} validation trees!'.format(len(dev_trees)))

    best_acc = FScore()

    for epoch in range(1, epochs + 1):
        print('........... epoch {} ...........'.format(epoch))

        total_cost = 0.0
        total_states = 0
        training_acc = FScore()

        np.random.shuffle(training_data)

        for b in range(num_batches):
            network.zero_grad()
            batch = training_data[(b * batch_size): ((b + 1) * batch_size)]
            batch_loss = None
            for example in batch:
                example_loss, example_states, acc = Parser.exploration(
                    example, fm, network, alpha, beta, unk_param,
                )
                total_states += example_states
                if batch_loss is not None:
                    batch_loss += example_loss
                else:
                    batch_loss = example_loss
                training_acc += acc
            # .item() extracts the scalar loss on CPU and GPU tensors alike
            total_cost += batch_loss.item()
            batch_loss.backward()
            optimizer.step()

            mean_cost = total_cost / total_states

            print(
                '\rBatch {}  Mean Cost {:.4f} [Train: {}]'.format(
                    b,
                    mean_cost,
                    training_acc,
                ),
                end='',
            )
            sys.stdout.flush()

            if ((b + 1) % parse_every) == 0 or b == (num_batches - 1):
                dev_acc = Parser.evaluate_corpus(
                    dev_trees,
                    fm,
                    network,
                )
                print(' [Dev: {}]'.format(dev_acc))

                if dev_acc > best_acc:
                    best_acc = dev_acc
                    torch.save(network, model_save_file)
                    print(' [saved model: {}]'.format(model_save_file))

        current_time = time.time()
        runmins = (current_time - start_time) / 60.
        print('  Elapsed time: {:.2f}m'.format(runmins))
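One idiom above is worth spelling out: -(-a // b) is ceiling division, used for both num_batches and parse_every. A quick check:

a, b = 10, 3
print(-(-a // b))        # 4, i.e. ceil(10 / 3)
print((a + b - 1) // b)  # 4, the equivalent spelled-out form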
Example #8
    if args.vocab is not None:
        fm = FeatureMapper.load_json(args.vocab)
    elif args.train is not None:
        fm = FeatureMapper(args.train)
        if args.vocab_output is not None:
            fm.save_json(args.vocab_output)
            print('Wrote vocabulary file {}'.format(args.vocab_output))
            sys.exit()
    else:
        print('Must specify either --vocab-file or --train-data.')
        print('    (Use -h or --help flag for full option list.)')
        sys.exit()

    if args.model is None:
        print('Must specify --model (or --write-vocab) parameter.')
        print('    (Use -h or --help flag for full option list.)')
        sys.exit()

    if args.test is not None:
        from parser import Parser
        import torch

        test_trees = PhraseTree.load_trees(args.test)
        print('Loaded test trees from {}'.format(args.test))
        network = torch.load(args.model)
        print('Loaded model from: {}'.format(args.model))
        accuracy = Parser.evaluate_corpus(test_trees, fm, network)
        print('Accuracy: {}'.format(accuracy))
    elif args.train is not None:
        train(fm, args)
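For context, the attributes read across these snippets suggest an argparse setup roughly like the following. The flag names are assumptions inferred from the attribute names and the error messages above, not confirmed by the source:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--vocab-file', dest='vocab')      # assumed flag name
parser.add_argument('--train-data', dest='train')      # assumed flag name
parser.add_argument('--write-vocab', dest='vocab_output')
parser.add_argument('--dev')
parser.add_argument('--test')
parser.add_argument('--model')
parser.add_argument('--epochs', type=int)
parser.add_argument('--batch-size', type=int)
parser.add_argument('--unk-param', type=float)
parser.add_argument('--alpha', type=float)
parser.add_argument('--beta', type=float)
args = parser.parse_args()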
Example #9
def train(fm, args):
    train_data_file = args.train
    dev_data_file = args.dev
    epochs = args.epochs
    batch_size = args.batch_size
    unk_param = args.unk_param
    alpha = args.alpha
    beta = args.beta
    model_save_file = args.model

    print("this is train mode")
    start_time = time.time()

    network = SpanParserNN(fm, args)
    optimizer = optimize.Adadelta(network.parameters(), eps=1e-7, rho=0.99)

    # network.cuda()

    training_data = fm.gold_data_from_file(train_data_file)
    num_batches = -(-len(training_data) // batch_size)
    print('Loaded {} training sentences ({} batches of size {})!'.format(
        len(training_data),
        num_batches,
        batch_size,
    ))
    parse_every = -(-num_batches // 4)  # evaluate on dev ~4 times per epoch

    dev_trees = PhraseTree.load_trees(dev_data_file)
    print('Loaded {} validation trees!'.format(len(dev_trees)))

    best_acc = FScore()

    for epoch in range(1, epochs + 1):
        print('........... epoch {} ...........'.format(epoch))

        total_cost = 0.0
        total_states = 0
        training_acc = FScore()

        np.random.shuffle(training_data)

        for b in range(num_batches):
            batch = training_data[(b * batch_size):((b + 1) * batch_size)]

            explore = [
                Parser.exploration(
                    example,
                    fm,
                    network,
                    alpha=alpha,
                    beta=beta,
                ) for example in batch
            ]
            for (_, acc) in explore:
                training_acc += acc

            batch = [example for (example, _) in explore]
            sum_loss = 0.0  # plain float: an np array breaks the '{:.4f}' format below

            for example in batch:

                ## random UNKing ##
                for (i, w) in enumerate(example['w']):
                    if w <= 2:
                        # skip the special UNK/START/STOP indices
                        continue

                    # drop rare words to UNK with prob unk_param / (unk_param + freq)
                    freq = fm.word_freq_list[w]
                    drop_prob = unk_param / (unk_param + freq)
                    r = np.random.random()
                    if r < drop_prob:
                        example['w'][i] = 0  # 0 is the UNK index

                fwd, back = network.evaluate_word(
                    example['w'],
                    example['t'],
                )

                for (left, right), correct in example['struct_data'].items():
                    scores = network(fwd, back, left, right, 'struct')
                    probs = F.softmax(scores, dim=0)
                    loss = -torch.log(probs[correct])
                    sum_loss += loss.item()
                    loss.backward(retain_graph=True)

                total_states += len(example['struct_data'])

                for (left, right), correct in example['label_data'].items():
                    scores = network(fwd, back, left, right, 'label')
                    probs = F.softmax(scores, dim=0)
                    loss = -torch.log(probs[correct])
                    sum_loss += loss.item()
                    loss.backward(retain_graph=True)
                total_states += len(example['label_data'])

            total_cost += sum_loss
            optimizer.step()
            network.zero_grad()
            mean_cost = total_cost / total_states

            print(
                '\rBatch {}  Mean Cost {:.4f} [Train: {}]'.format(
                    b,
                    mean_cost,
                    training_acc,
                ),
                end='',
            )
            sys.stdout.flush()

            if ((b + 1) % parse_every) == 0 or b == (num_batches - 1):
                dev_acc = Parser.evaluate_corpus(
                    dev_trees,
                    fm,
                    network,
                )
                print('  [Val: {}]'.format(dev_acc))

                if dev_acc > best_acc:
                    best_acc = dev_acc
                    torch.save(network, model_save_file)
                    print('    [saved model: {}]'.format(model_save_file))

        current_time = time.time()
        runmins = (current_time - start_time) / 60.
        print('  Elapsed time: {:.2f}m'.format(runmins))
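The per-loss backward(retain_graph=True) calls above accumulate gradients through the single shared evaluate_word pass. A stripped-down illustration of why the graph must be retained when several losses backpropagate through one forward pass:

import torch

x = torch.ones(3, requires_grad=True)
shared = x * 2                      # one shared forward pass
loss_a = shared.sum()
loss_b = (shared ** 2).sum()
loss_a.backward(retain_graph=True)  # keep the graph alive for the next call
loss_b.backward()                   # gradients accumulate into x.grad
print(x.grad)                       # tensor([10., 10., 10.])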
Example #10
    def label(self, nonterminals=()):
        # Wrap the top of the stack in one PhraseTree per nonterminal,
        # preserving the (left, right) span boundaries.
        for nt in nonterminals:
            (left, right, trees) = self.stack.pop()
            tree = PhraseTree(symbol=nt, children=trees)
            self.stack.append((left, right, [tree]))
Example #11
    def shift(self):
        j = self.i  # index of the word being shifted
        treelet = PhraseTree(leaf=j)
        self.stack.append((j, j, [treelet]))  # single-word span (j, j)
        self.i += 1
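To see the stack discipline shared by shift and label, here is a toy trace with a stand-in node class; only the shape of the (left, right, trees) stack entries matters, so PhraseTree itself is not required:

class Node:
    # minimal stand-in for PhraseTree
    def __init__(self, symbol=None, children=None, leaf=None):
        self.symbol, self.children, self.leaf = symbol, children, leaf

stack, i = [], 0
# shift: push the single-word span (j, j, [leaf]) and advance the pointer
j = i
stack.append((j, j, [Node(leaf=j)]))
i += 1
# label-NP: pop the top span and wrap its trees under one nonterminal
left, right, trees = stack.pop()
stack.append((left, right, [Node(symbol='NP', children=trees)]))
print(stack[-1][:2])  # span boundaries stay (0, 0)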