Пример #1
0
def eval_files(gold_file, test_file):
    """Score a predicted tree file against a gold tree file.

    Both files are read with ``PhraseTree.load_treefile`` and the trees are
    paired positionally with ``zip``.  Pairs are split into two groups by
    whether ``len(gold) == len(test)``; each group is scored separately by
    ``eval_trees`` (verbose), and the label scores of the two groups are
    added into one ``FScore`` which is returned.  The sum of the two
    structural scores is only printed.
    """
    gold_trees = PhraseTree.load_treefile(gold_file)
    test_trees = PhraseTree.load_treefile(test_file)
    accuracy = FScore()

    # buckets[True] holds pairs of equal length, buckets[False] the rest;
    # each bucket is a (gold list, predicted list) pair kept in file order.
    buckets = {True: ([], []), False: ([], [])}
    for gold_tree, pred_tree in zip(gold_trees, test_trees):
        golds, preds = buckets[len(gold_tree) == len(pred_tree)]
        golds.append(gold_tree)
        preds.append(pred_tree)

    print("***eval matched pair***")
    matched_gold, matched_pred = buckets[True]
    matched_struct, matched_label = eval_trees(
        matched_gold, matched_pred, verbose=True)
    accuracy += matched_label

    print("***eval unmatched pair***")
    unmatched_gold, unmatched_pred = buckets[False]
    unmatched_struct, unmatched_label = eval_trees(
        unmatched_gold, unmatched_pred, verbose=True)
    accuracy += unmatched_label

    print("sum struct score:", matched_struct + unmatched_struct)
    return accuracy
Пример #2
0
 def label(self, nonterminals=()):
     """
     Wrap the top stack entry in one new node per nonterminal.

     For each symbol in `nonterminals`, in order, the top
     (left, right, trees) entry is popped, a PhraseTree with that symbol
     and `trees` as children is built, and (left, right, [tree]) is pushed
     back.  Successive symbols therefore nest: the first one given ends up
     innermost.  With no nonterminals the stack is left untouched.

     The default is an immutable empty tuple rather than a list so that
     the default object can never be shared and mutated across calls.
     """
     for nt in nonterminals:
         (left, right, trees) = self.stack.pop()
         tree = PhraseTree(symbol=nt,
                           children=trees,
                           sentence=self.sentence)
         self.stack.append((left, right, [tree]))
 def write_predicted(fname,
                     test_trees,
                     fm,
                     network,
                     batch_size,
                     k=5,
                     ap=2.5,
                     mc=3):
     """
     Parse the sentences of `test_trees` in batches and write the results.

     Input trees being used only to carry sentences.

     The sentences are taken `batch_size` at a time (the last batch may be
     shorter) and handed to parse_batch_variable_beam_stream together with
     `fm`, `network`, `k`, `ap` and `mc`.  Every predicted tree is wrapped
     under a 'TOP' node and written to `fname`, one tree per line.

     The file is opened in a `with` block so it is closed even when
     parsing or writing raises.
     """
     total = len(test_trees)
     with open(fname, 'w') as f:
         for start in range(0, total, batch_size):
             # Clamp the upper bound so the final, shorter batch does not
             # index past the end of test_trees.
             stop = min(start + batch_size, total)
             batch = [test_trees[idx].sentence for idx in range(start, stop)]
             batch_predicted = parse_batch_variable_beam_stream(
                 batch, fm, network, k, ap, mc)
             for predicted in batch_predicted:
                 topped = PhraseTree(
                     symbol='TOP',
                     children=[predicted],
                     sentence=predicted.sentence,
                 )
                 f.write(str(topped))
                 f.write('\n')
Пример #4
0
    def vocab_init(fname, verbose=True):
        """
        Build the tag vocabulary from a tree file.

        Every tree loaded from `fname` contributes the second element of
        each pair in its `sentence`.  The resulting tag set is sorted,
        prefixed with the placeholder 'XX', and mapped to consecutive
        integer ids.  Returns a dict with the single key 'tdict'.
        When `verbose` is true, progress is printed to stdout.
        """
        tag_counts = defaultdict(int)

        for idx, tree in enumerate(PhraseTree.load_treefile(fname)):
            for _, tag in tree.sentence:
                tag_counts[tag] += 1

            if verbose:
                print('\rTree {}'.format(idx), end='')
                sys.stdout.flush()

        if verbose:
            print('\r', end='')

        # Only the set of tags matters from here on; id = list position.
        ordered_tags = ['XX'] + sorted(tag_counts)
        tdict = OrderedDict(zip(ordered_tags, range(len(ordered_tags))))

        if verbose:
            print('Loading features from {}'.format(fname))
            print('( {} tags)'.format(len(tdict), ))

        return {
            'tdict': tdict,
        }
Пример #5
0
 def shift(self):
     """
     Push the word at the current position onto the stack.

     A one-leaf PhraseTree is created whose symbol is the second element
     of the sentence entry at that position, it is pushed as the span
     (position, position), and the position counter `self.i` advances.
     """
     position = self.i
     symbol = self.sentence[position][1]
     leaf_tree = PhraseTree(symbol=symbol,
                            leaf=position,
                            sentence=self.sentence)
     self.stack.append((position, position, [leaf_tree]))
     self.i += 1
    def vocab_init(fname, verbose=True):
        """
        Build word, tag and label vocabularies from a tree file.

        Words and tags are counted over every (word, tag) pair of every
        tree loaded from `fname`; labels are the suffixes of the
        'label-...' actions yielded by Parser.gold_actions for each tree.
        Each vocabulary is sorted and mapped to consecutive integer ids;
        the word and tag vocabularies start with FeatureMapper.UNK, START
        and STOP.

        Returns a dict with keys 'wdict', 'word_freq', 'tdict', 'ldict'.
        When `verbose` is true, progress is printed to stdout.
        """
        label_prefix = 'label-'
        word_freq = defaultdict(int)
        tag_counts = defaultdict(int)
        label_counts = defaultdict(int)

        for idx, tree in enumerate(PhraseTree.load_treefile(fname)):
            for word, tag in tree.sentence:
                word_freq[word] += 1
                tag_counts[tag] += 1

            for action in Parser.gold_actions(tree):
                if not action.startswith(label_prefix):
                    continue
                label_counts[action[len(label_prefix):]] += 1

            if verbose:
                print('\rTree {}'.format(idx), end='')
                sys.stdout.flush()

        if verbose:
            print('\r', end='')

        def indexed(symbols):
            # symbol -> position in the list, in list order.
            return OrderedDict(zip(symbols, range(len(symbols))))

        reserved = [
            FeatureMapper.UNK,
            FeatureMapper.START,
            FeatureMapper.STOP,
        ]
        wdict = indexed(reserved + sorted(word_freq))
        tdict = indexed(reserved + sorted(tag_counts))
        ldict = indexed(sorted(label_counts))

        if verbose:
            print('Loading features from {}'.format(fname))
            sizes = (len(wdict), len(tdict), len(ldict))
            print('({} words, {} tags, {} nonterminal-chains)'.format(*sizes))

        return {
            'wdict': wdict,
            'word_freq': word_freq,
            'tdict': tdict,
            'ldict': ldict,
        }
def extract_origin_grammar(tree_file, out_file="grammar.out"):
    """Collect the grammar entries of a tree file and write them out.

    A single ``defaultdict(int)`` is passed to ``grammar()`` of every tree
    loaded from `tree_file` (presumably each call adds that tree's rules
    to it -- confirm against PhraseTree.grammar).  The dict's keys are
    written to `out_file` via ``write_docs`` and the dict is returned.
    """
    rule_counts = defaultdict(int)
    for parsed in PhraseTree.load_treefile(tree_file):
        parsed.grammar(rule_counts)
    write_docs(fname=out_file, docs=list(rule_counts))
    return rule_counts
Пример #8
0
 def gold_data_from_file(self, fname):
     """
     Static oracle for file.

     Returns a list with one `self.gold_data(tree)` result per tree
     loaded from `fname`, in file order.
     """
     return [
         self.gold_data(tree) for tree in PhraseTree.load_treefile(fname)
     ]
Пример #9
0
 def write_predicted(fname, test_trees, fm, network):
     """
     Parse every sentence of `test_trees` and write the predictions.

     Input trees being used only to carry sentences.

     Each sentence is parsed with Parser.parse(sentence, fm, network); the
     predicted tree is wrapped under a 'TOP' node and written to `fname`,
     one tree per line.

     The file is opened in a `with` block so it is closed even when
     parsing or writing raises.
     """
     with open(fname, 'w') as f:
         for tree in test_trees:
             predicted = Parser.parse(tree.sentence, fm, network)
             topped = PhraseTree(
                 symbol='TOP',
                 children=[predicted],
                 sentence=predicted.sentence,
             )
             f.write(str(topped))
             f.write('\n')
Пример #10
0
 def shift(self):
     """
     Push a leaf for the current position onto the stack.

     The leaf PhraseTree is built from the position alone, pushed as the
     span (position, position), and `self.i` is advanced by one.
     """
     position = self.i
     leaf_tree = PhraseTree(leaf=position)
     self.stack.append((position, position, [leaf_tree]))
     self.i += 1
Пример #11
0
    def train(
        feature_mapper,
        word_dims,
        tag_dims,
        lstm_units,
        hidden_units,
        epochs,
        batch_size,
        train_data_file,
        dev_data_file,
        model_save_file,
        droprate,
        unk_param,
        alpha=1.0,
        beta=0.0,
    ):
        """
        Train a fresh Network and save the best model seen on the dev set.

        Takes no `self`; presumably a @staticmethod of Network (the
        decorator and class header are not visible here).

        A Network is built from the vocabulary sizes reported by
        `feature_mapper` and the given dimensions, then trained for
        `epochs` passes over the examples returned by
        `feature_mapper.gold_data_from_file(train_data_file)`.  For each
        batch, every example is first passed through Parser.exploration
        (with `alpha` and `beta`), its word ids are randomly replaced by
        id 0, and the negative log-probability of the correct action is
        summed over all entries of the example's 'struct_data' and
        'label_data'.  One trainer update is performed per batch.

        The dev trees from `dev_data_file` are evaluated with
        Parser.evaluate_corpus roughly four times per epoch and after the
        last batch; whenever the dev score exceeds the best so far, the
        network is saved to `model_save_file`.

        Parameters:
            feature_mapper: supplies total_words(), total_tags(),
                total_label_actions(), gold_data_from_file() and
                word_freq_list.
            word_dims, tag_dims, lstm_units, hidden_units, droprate:
                forwarded unchanged to the Network constructor.
            epochs: number of passes over the training data.
            batch_size: number of examples per trainer update.
            unk_param: z in the replacement probability z / (z + freq).
            alpha, beta: forwarded to Parser.exploration.

        Returns nothing; progress is printed to stdout.
        """

        start_time = time.time()

        fm = feature_mapper
        word_count = fm.total_words()
        tag_count = fm.total_tags()

        network = Network(
            word_count=word_count,
            tag_count=tag_count,
            word_dims=word_dims,
            tag_dims=tag_dims,
            lstm_units=lstm_units,
            hidden_units=hidden_units,
            struct_out=2,
            label_out=fm.total_label_actions(),
            droprate=droprate,
        )
        network.init_params()

        print('Hidden units: {},  per-LSTM units: {}'.format(
            hidden_units,
            lstm_units,
        ))
        print('Embeddings: word={}  tag={}'.format(
            (word_count, word_dims),
            (tag_count, tag_dims),
        ))
        print('Dropout rate: {}'.format(droprate))
        # NOTE(review): the range below is only printed here; the actual
        # initialisation happens inside network.init_params() -- confirm
        # the two agree.
        print('Parameters initialized in [-0.01, 0.01]')
        print('Random UNKing parameter z = {}'.format(unk_param))
        print('Exploration: alpha={} beta={}'.format(alpha, beta))

        training_data = fm.gold_data_from_file(train_data_file)
        # Ceiling division: the last batch may be smaller than batch_size.
        num_batches = -(-len(training_data) // batch_size)
        print('Loaded {} training sentences ({} batches of size {})!'.format(
            len(training_data),
            num_batches,
            batch_size,
        ))
        # ceil(num_batches / 4): interval, in batches, between dev
        # evaluations.
        parse_every = -(-num_batches // 4)

        dev_trees = PhraseTree.load_treefile(dev_data_file)
        print('Loaded {} validation trees!'.format(len(dev_trees)))

        best_acc = FScore()

        # NOTE(review): xrange is the Python 2 builtin; under Python 3
        # this needs range (or an xrange alias defined elsewhere).
        for epoch in xrange(1, epochs + 1):
            print('........... epoch {} ...........'.format(epoch))

            # Running totals are reset every epoch, so the printed mean
            # cost and training score cover the current epoch only.
            total_cost = 0.0
            total_states = 0
            training_acc = FScore()

            np.random.shuffle(training_data)

            for b in xrange(num_batches):
                batch = training_data[(b * batch_size):((b + 1) * batch_size)]

                # Each exploration call yields an (example, score) pair;
                # the scores feed the training score, the examples
                # replace the batch that is trained on.
                explore = [
                    Parser.exploration(
                        example,
                        fm,
                        network,
                        alpha=alpha,
                        beta=beta,
                    ) for example in batch
                ]
                for (_, acc) in explore:
                    training_acc += acc

                batch = [example for (example, _) in explore]

                # Start a new DyNet computation graph for this batch.
                dynet.renew_cg()
                network.prep_params()

                errors = []

                for example in batch:

                    ## random UNKing ##
                    for (i, w) in enumerate(example['w']):
                        # Ids 0..2 are never replaced (presumably the
                        # reserved UNK/START/STOP ids -- confirm against
                        # the vocabulary construction).
                        if w <= 2:
                            continue

                        freq = fm.word_freq_list[w]
                        # Rarer words are replaced more often.
                        drop_prob = unk_param / (unk_param + freq)
                        r = np.random.random()
                        if r < drop_prob:
                            # In-place overwrite of the example's word id.
                            example['w'][i] = 0

                    fwd, back = network.evaluate_recurrent(
                        example['w'],
                        example['t'],
                    )

                    # One loss term per (left, right) key: the negative
                    # log of the softmax entry selected by `correct`.
                    for (left,
                         right), correct in example['struct_data'].items():
                        scores = network.evaluate_struct(
                            fwd, back, left, right)

                        probs = dynet.softmax(scores)
                        loss = -dynet.log(dynet.pick(probs, correct))
                        errors.append(loss)
                    total_states += len(example['struct_data'])

                    # Same loss form for the label decisions.
                    for (left,
                         right), correct in example['label_data'].items():
                        scores = network.evaluate_label(fwd, back, left, right)

                        probs = dynet.softmax(scores)
                        loss = -dynet.log(dynet.pick(probs, correct))
                        errors.append(loss)
                    total_states += len(example['label_data'])

                batch_error = dynet.esum(errors)
                total_cost += batch_error.scalar_value()
                batch_error.backward()
                network.trainer.update()

                # NOTE(review): raises ZeroDivisionError if no struct or
                # label entries have been seen yet in this epoch.
                mean_cost = total_cost / total_states

                print(
                    '\rBatch {}  Mean Cost {:.4f} [Train: {}]'.format(
                        b,
                        mean_cost,
                        training_acc,
                    ),
                    end='',
                )
                sys.stdout.flush()

                # Evaluate on the dev set every parse_every batches and
                # always after the final batch of the epoch.
                if ((b + 1) % parse_every) == 0 or b == (num_batches - 1):
                    dev_acc = Parser.evaluate_corpus(
                        dev_trees,
                        fm,
                        network,
                    )
                    print('  [Val: {}]'.format(dev_acc))

                    # Keep only the best-scoring model on disk.
                    if dev_acc > best_acc:
                        best_acc = dev_acc
                        network.save(model_save_file)
                        print('    [saved model: {}]'.format(model_save_file))

            current_time = time.time()
            runmins = (current_time - start_time) / 60.
            print('  Elapsed time: {:.2f}m'.format(runmins))
Пример #12
0
    else:
        print('Must specify either --vocab-file or --train-data.')
        print('    (Use -h or --help flag for full option list.)')
        sys.exit()

    if args.model is None:
        print('Must specify --model or (or --write-vocab) parameter.')
        print('    (Use -h or --help flag for full option list.)')
        sys.exit()

    if args.test is not None:
        from phrase_tree import PhraseTree
        from network import Network
        from parser import Parser

        test_trees = PhraseTree.load_treefile(args.test)
        print('Loaded test trees from {}'.format(args.test))
        network = Network.load(args.model)
        print('Loaded model from: {}'.format(args.model))
        accuracy = Parser.evaluate_corpus(test_trees, fm, network)
        print('Accuracy: {}'.format(accuracy))
    elif args.train is not None:
        from network import Network

        if args.np_seed is not None:
            import numpy as np
            np.random.seed(args.np_seed)

        print('L2 regularization: {}'.format(args.dynet_l2))

        Network.train(feature_mapper=fm,
Пример #13
0
import argparse

from phrase_tree import PhraseTree, FScore

# Command line: --gold and --pred each name a tree file.
parser = argparse.ArgumentParser()
for flag in ('--gold', '--pred'):
    parser.add_argument(flag, type=str)
args = parser.parse_args()

gold_trees = PhraseTree.load_treefile(args.gold)
pred_trees = PhraseTree.load_treefile(args.pred)

# Trees are paired by position; each predicted tree is compared with its
# gold counterpart and the per-pair scores are accumulated into one FScore.
accuracy = FScore()
for gold, pred in zip(gold_trees, pred_trees):
    accuracy += pred.compare(gold)

print(accuracy)
Пример #14
0
 def gold_data_from_file(self, fname):
     """
     Static oracle for file.

     Loads the trees in `fname` and returns whatever
     `self.gold_data_from_trees` produces for them.
     """
     return self.gold_data_from_trees(PhraseTree.load_treefile(fname))
Пример #15
0
    def train(
        feature_mapper,
        word_dims,
        tag_dims,
        lstm_units,
        hidden_units,
        epochs,
        batch_size,
        train_data_file,
        dev_data_file,
        model_save_file,
        droprate,
        unk_param,
        alpha=1.0,
        beta=0.0,
        GPU=None,
    ):
        """
        Train a fresh Network (PyTorch) and save the best dev-set model.

        Takes no `self`; presumably a @staticmethod of Network (the
        decorator and class header are not visible here).

        A Network is built from the vocabulary sizes reported by
        `feature_mapper` and the given dimensions, then trained for
        `epochs` passes over the examples returned by
        `feature_mapper.gold_data_from_file(train_data_file)`.  For each
        batch, every example is first passed through Parser.exploration
        (with `alpha` and `beta`), its word ids are randomly replaced by
        id 0, and a cross-entropy loss term is added for every entry of
        the example's 'struct_data' and 'label_data'.  The summed batch
        loss is back-propagated and `network.trainer.step()` is called
        once per batch.

        The dev trees from `dev_data_file` are evaluated with
        Parser.evaluate_corpus roughly four times per epoch and after the
        last batch; whenever the dev score exceeds the best so far, the
        network is saved to `model_save_file`.

        Parameters:
            feature_mapper: supplies total_words(), total_tags(),
                total_label_actions(), gold_data_from_file() and
                word_freq_list.
            word_dims, tag_dims, lstm_units, hidden_units, droprate:
                forwarded unchanged to the Network constructor.
            epochs: number of passes over the training data.
            batch_size: number of examples per optimizer step.
            unk_param: z in the replacement probability z / (z + freq).
            alpha, beta: forwarded to Parser.exploration.
            GPU: forwarded to Network; when not None the loss module is
                also moved with .cuda(GPU).

        Returns nothing; progress is printed to stdout.
        """

        start_time = time.time()

        fm = feature_mapper
        word_count = fm.total_words()
        tag_count = fm.total_tags()

        network = Network(
            word_count=word_count,
            tag_count=tag_count,
            word_dims=word_dims,
            tag_dims=tag_dims,
            lstm_units=lstm_units,
            hidden_units=hidden_units,
            struct_out=2,
            label_out=fm.total_label_actions(),
            droprate=droprate,
            GPU=GPU,
        )

        # NOTE(review): size_average=False requests a summed rather than
        # averaged loss; the argument is deprecated in later PyTorch
        # releases in favour of reduction='sum' -- confirm against the
        # installed version.
        f_loss = nn.CrossEntropyLoss(size_average=False)
        if GPU is not None:
            f_loss = f_loss.cuda(GPU)
        # NOTE(review): the seeds are set after Network(...) has already
        # been constructed, so they do not cover anything random done in
        # the constructor; np.random (used below) is not seeded here.
        random.seed(1)
        torch.manual_seed(1)

        print('Hidden units: {},  per-LSTM units: {}'.format(
            hidden_units,
            lstm_units,
        ))
        print('Embeddings: word={}  tag={}'.format(
            (word_count, word_dims),
            (tag_count, tag_dims),
        ))
        print('Dropout rate: {}'.format(droprate))
        # NOTE(review): this range is only printed; no initialisation
        # call is visible in this function -- confirm it matches what
        # Network actually does.
        print('Parameters initialized in [-0.01, 0.01]')
        print('Random UNKing parameter z = {}'.format(unk_param))
        print('Exploration: alpha={} beta={}'.format(alpha, beta))

        training_data = fm.gold_data_from_file(train_data_file)
        # Ceiling division: the last batch may be smaller than batch_size.
        num_batches = -(-len(training_data) // batch_size)
        print('Loaded {} training sentences ({} batches of size {})!'.format(
            len(training_data),
            num_batches,
            batch_size,
        ))
        # ceil(num_batches / 4): interval, in batches, between dev
        # evaluations.
        parse_every = -(-num_batches // 4)

        dev_trees = PhraseTree.load_treefile(dev_data_file)
        print('Loaded {} validation trees!'.format(len(dev_trees)))

        best_acc = FScore()
        network.init_hidden()

        # NOTE(review): xrange is the Python 2 builtin; under Python 3
        # this needs range (or an xrange alias defined elsewhere).
        for epoch in xrange(1, epochs + 1):
            print('........... epoch {} ...........'.format(epoch))

            # Running totals are reset every epoch, so the printed mean
            # cost and training score cover the current epoch only.
            total_cost = 0.0
            total_states = 0
            training_acc = FScore()

            np.random.shuffle(training_data)

            for b in xrange(num_batches):
                # Clear gradients left over from the previous batch.
                network.trainer.zero_grad()
                batch = training_data[(b * batch_size):((b + 1) * batch_size)]

                # Each exploration call yields an (example, score) pair;
                # the scores feed the training score, the examples
                # replace the batch that is trained on.
                explore = [
                    Parser.exploration(
                        example,
                        fm,
                        network,
                        alpha=alpha,
                        beta=beta,
                    ) for example in batch
                ]
                for (_, acc) in explore:
                    training_acc += acc

                batch = [example for (example, _) in explore]
                errors = []

                network.init_hidden()
                for example in batch:

                    ## random UNKing ##
                    for (i, w) in enumerate(example['w']):
                        # Ids 0..2 are never replaced (presumably the
                        # reserved UNK/START/STOP ids -- confirm against
                        # the vocabulary construction).
                        if w <= 2:
                            continue
                        freq = fm.word_freq_list[w]
                        # Rarer words are replaced more often.
                        drop_prob = unk_param / (unk_param + freq)
                        r = np.random.random()
                        if r < drop_prob:
                            # In-place overwrite of the example's word id.
                            example['w'][i] = 0

                    fwd, back = network.evaluate_recurrent(
                        example['w'],
                        example['t'],
                    )

                    # Split the struct decisions into parallel lists of
                    # (left, right) keys and their correct targets.
                    indices, targets = [], []
                    for (left,
                         right), correct in example['struct_data'].items():
                        indices.append((left, right))
                        targets.append(correct)
                    # The bare string below is disabled debugging code
                    # kept as a no-op expression statement.
                    """
                    print(example['w'])
                    print(indices)
                    print(targets)
                    raw_input()
                    """

                    targets = autograd.Variable(torch.LongTensor(targets))
                    if network.GPU is not None:
                        targets = targets.cuda(network.GPU)
                    # All struct keys of the example are scored in one
                    # call; a loss term is then added per key.
                    scores = network.evaluate_struct(fwd, back, indices)
                    for i in xrange(len(targets)):
                        score = scores[i]
                        target = targets[i]
                        loss = f_loss(score, target)
                        errors.append(loss)
                    total_states += len(example['struct_data'])

                    # Same procedure for the label decisions.
                    indices, targets = [], []
                    for (left,
                         right), correct in example['label_data'].items():
                        indices.append((left, right))
                        targets.append(correct)
                    targets = autograd.Variable(torch.LongTensor(targets))
                    if network.GPU is not None:
                        targets = targets.cuda(network.GPU)
                    scores = network.evaluate_label(fwd, back, indices)
                    for i in xrange(len(targets)):
                        score = scores[i]
                        target = targets[i]
                        loss = f_loss(score, target)
                        errors.append(loss)
                    total_states += len(example['label_data'])

                batch_loss = torch.sum(torch.cat(errors))
                #network.trainer.zero_grad()
                batch_loss.backward()
                network.trainer.step()

                # NOTE(review): indexing .data[0] on a summed loss relies
                # on older PyTorch behaviour; newer releases expect
                # .item() -- confirm against the installed version.
                total_cost += batch_loss.data[0]
                # NOTE(review): raises ZeroDivisionError if no struct or
                # label entries have been seen yet in this epoch.
                mean_cost = (total_cost / total_states)

                print(
                    '\rBatch {}  Mean Cost {:.4f} [Train: {}]'.format(
                        b,
                        mean_cost,
                        training_acc,
                    ),
                    end='',
                )
                sys.stdout.flush()

                # Evaluate on the dev set every parse_every batches and
                # always after the final batch of the epoch.
                if ((b + 1) % parse_every) == 0 or b == (num_batches - 1):
                    dev_acc = Parser.evaluate_corpus(
                        dev_trees,
                        fm,
                        network,
                    )
                    print('  [Val: {}]'.format(dev_acc))

                    # Keep only the best-scoring model on disk.
                    if dev_acc > best_acc:
                        best_acc = dev_acc
                        network.save(model_save_file)
                        print('    [saved model: {}]'.format(model_save_file))

            current_time = time.time()
            runmins = (current_time - start_time) / 60.
            print('  Elapsed time: {:.2f}m'.format(runmins))