Example #1
    def gold_data_from_file(self, fname):
        # Load sentences from a file and convert each one into the
        # gold-data dict built by gold_data() (Example #3).
        sentences = SegSentence.load_sentence_file(fname)
        result = []
        for sentence in sentences:
            sentence_data = self.gold_data(sentence)
            result.append(sentence_data)
        return result
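
A hypothetical call, assuming `corpus` is an instance of the class these methods come from (the class itself is not shown in these excerpts):

    # Illustrative only: turn a sentence file into a list of gold-data dicts.
    training_data = corpus.gold_data_from_file('data/train.txt')
    print('{} training examples loaded'.format(len(training_data)))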
Example #2
    def evaluate_corpus(sentences, fm, network):
        # Segment every sentence and pool the per-sentence scores
        # into a single corpus-level FScore.
        accuracy = FScore()
        for sentence in sentences:
            seg_sentence = SegSentence(sentence)
            predicted = Segmenter.segment(seg_sentence, fm, network)
            local_accuracy = predicted.compare(seg_sentence)
            accuracy += local_accuracy

        return accuracy
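
The `accuracy += local_accuracy` line requires FScore to support in-place addition. A minimal sketch of such an accumulator (illustrative only; the real FScore class is not shown in these excerpts):

    class FScoreSketch(object):
        # Sums match/predicted/gold counts so corpus-level precision and
        # recall come from pooled counts, not per-sentence averages.
        def __init__(self, correct=0, predcount=0, goldcount=0):
            self.correct = correct
            self.predcount = predcount
            self.goldcount = goldcount

        def __iadd__(self, other):
            self.correct += other.correct
            self.predcount += other.predcount
            self.goldcount += other.goldcount
            return self

        def fscore(self):
            p = float(self.correct) / max(self.predcount, 1)
            r = float(self.correct) / max(self.goldcount, 1)
            return 2 * p * r / (p + r) if (p + r) > 0 else 0.0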
Example #3
    def gold_data(self, sentence):
        # Bundle everything the trainer needs for one sentence: the wrapped
        # sentence, its bigram/unigram id sequences, and the training features.
        seg_sentence = SegSentence(sentence)
        fwd_bigrams, unigrams = self.sentence_sequence(seg_sentence)

        features = Segmenter.training_data(seg_sentence)

        return {
            'segSentence': seg_sentence,
            'fwd_bigrams': fwd_bigrams,
            'unigrams': unigrams,
            'features': features,
        }
Example #4
    def write_predicted(fname, test_sentences, fm, network):
        """
        Segment each test sentence and write the predictions to a file.
        """
        start_time = time.time()

        with open(fname, 'w') as f:
            for sentence in test_sentences:
                seg_sentence = SegSentence(sentence)
                predicted = Segmenter.segment(seg_sentence, fm, network)
                f.write(predicted.pretty() + '\n')

        current_time = time.time()
        runmins = (current_time - start_time) / 60
        print(' Elapsed time: {:.2f}m'.format(runmins))
Example #5
    def vocab_init(fname, verbose=True):
        # Count unigram, bigram, and label frequencies over the training
        # file, then build index dictionaries for each vocabulary.
        unigrams_freq = defaultdict(int)
        bigrams_freq = defaultdict(int)
        label_freq = defaultdict(int)

        sentences = SegSentence.load_sentence_file(fname)
        for i, sentence in enumerate(sentences):
            pre_unigram = Corpus.START
            for (unigram, label) in sentence:
                unigrams_freq[unigram] += 1
                # Count each character bigram in both orders.
                bigrams_freq[pre_unigram + unigram] += 1
                bigrams_freq[unigram + pre_unigram] += 1
                pre_unigram = unigram
                label_freq[label] += 1
            bigrams_freq[pre_unigram + Corpus.STOP] += 1
            bigrams_freq[Corpus.STOP + pre_unigram] += 1

        # Reserve id 0 for the unknown bigram, and ids 0-2 for the
        # START/STOP/UNK unigram markers.
        bigrams = [Corpus.UNK] + sorted(bigrams_freq)
        unigrams = [
            Corpus.START,
            Corpus.STOP,
            Corpus.UNK,
        ] + sorted(unigrams_freq)

        bdict = OrderedDict((b, i) for (i, b) in enumerate(bigrams))
        udict = OrderedDict((u, i) for (i, u) in enumerate(unigrams))

        labels = sorted(label_freq)
        ldict = OrderedDict((l, i) for (i, l) in enumerate(labels))

        return {
            'origin_sentences': sentences,
            'bdict': bdict,
            'bigrams_freq': bigrams_freq,
            'unigrams_freq': unigrams_freq,
            'udict': udict,
            'ldict': ldict,
            'bigrams': bigrams,
            'unigrams': unigrams,
        }
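
Note the index layout this produces: bigram id 0 is Corpus.UNK, and unigram ids 0, 1, 2 are START, STOP, and UNK. The `uni <= 2` / `bi <= 2` guards in the training loop (Example #8) skip these low, reserved ids. A toy check, assuming vocab_init is a static method of a Corpus class exposing those marker constants:

    # Illustrative only: inspect the reserved vocabulary slots.
    vocab = Corpus.vocab_init('data/train.txt')
    assert vocab['bigrams'][0] == Corpus.UNK
    assert vocab['unigrams'][:3] == [Corpus.START, Corpus.STOP, Corpus.UNK]
    assert vocab['udict'][Corpus.UNK] == 2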
Example #6
    else:
        print('Must specify either --vocab_file or --training-data.')
        print('    (Use -h or --help flag for full option list.)')
        sys.exit()

    if args.model is None:
        print('Must specify the --model (or --write-vocab) parameter.')
        print('    (Use -h or --help flag for full option list.)')
        sys.exit()

    if args.test is not None:
        from seg_sentence import SegSentence
        from network import Network
        from Segmenter import Segmenter

        test_sentences = SegSentence.load_sentence_file(args.test)
        print('Loaded test sentences from {}'.format(args.test))
        network = Network.load(args.model)
        print('Loaded model from: {}'.format(args.model))
        accuracy = Segmenter.evaluate_corpus(test_sentences, fm, network)
        # Segmenter.write_predicted('data/predicted', test_sentences, fm, network)
        print('Accuracy: {}'.format(accuracy))

    elif args.train is not None:
        from network import Network
        if args.np_seed is not None:
            import numpy as np
            np.random.seed(args.np_seed)


        network = Network.train(
Example #7
    def wrap_result(self):
        # Pair each character with its predicted label and re-attach the
        # sentence-boundary markers with the dummy label 'NA'.
        return SegSentence(
            [('S', 'NA')] +
            [(c, predict_l) for ((c, gold_l), predict_l)
             in zip(self.gold_sentence, self.labels)] +
            [('/S', 'NA')])
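
A toy run of the comprehension with illustrative values ('S' and '/S' are the sentence-boundary markers, carrying the dummy label 'NA'):

    gold_sentence = [('a', 'b'), ('b', 'e')]  # (character, gold label) pairs
    labels = ['s', 's']                       # predicted labels
    wrapped = ([('S', 'NA')] +
               [(c, p) for ((c, g), p) in zip(gold_sentence, labels)] +
               [('/S', 'NA')])
    print(wrapped)  # [('S', 'NA'), ('a', 's'), ('b', 's'), ('/S', 'NA')]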
Example #8
    def train(corpus, bigrams_dims, unigrams_dims, lstm_units, hidden_units,
              epochs, batch_size, train_data_file, dev_data_file,
              model_save_file, droprate, unk_params, alpha, beta):

        start_time = time.time()

        fm = corpus  # the corpus object also serves as the feature mapper
        bigrams_size = corpus.total_bigrams()
        unigrams_size = corpus.total_unigrams()

        network = Network(
            bigrams_size=bigrams_size,
            unigrams_size=unigrams_size,
            bigrams_dims=bigrams_dims,
            unigrams_dims=unigrams_dims,
            lstm_units=lstm_units,
            hidden_units=hidden_units,
            label_size=fm.total_labels(),
            span_nums=fm.total_span_nums(),
            droprate=droprate,
        )

        network.init_params()

        print('Hidden units: {}, per-LSTM units: {}'.format(
            hidden_units,
            lstm_units,
        ))

        print('Embeddings: bigrams = {}, unigrams = {}'.format(
            (bigrams_size, bigrams_dims), (unigrams_size, unigrams_dims)))

        print('Dropout rate : {}'.format(droprate))
        print('Parameters initialized in [-0.01,0.01]')
        print('Random UNKing parameter z = {}'.format(unk_params))

        training_data = corpus.gold_data_from_file(train_data_file)
        # Ceiling division: number of batches needed to cover the data.
        num_batched = -(-len(training_data) // batch_size)
        print('Loaded {} training sentences ({} batches of size {})!'.format(
            len(training_data),
            num_batched,
            batch_size,
        ))

        # Run a validation pass four times per epoch.
        parse_every = -(-num_batched // 4)

        dev_sentences = SegSentence.load_sentence_file(dev_data_file)
        print('Loaded {} validation sentences!'.format(len(dev_sentences)))

        best_acc = FScore()
        for epoch in range(1, epochs + 1):
            print('............ epoch {} ............'.format(epoch))

            total_cost = 0.0
            total_states = 0
            training_acc = FScore()

            np.random.shuffle(training_data)

            for b in range(num_batched):
                batch = training_data[(b * batch_size):(b + 1) * batch_size]

                explore = [
                    Segmenter.exploration(example,
                                          fm,
                                          network,
                                          alpha=alpha,
                                          beta=beta) for example in batch
                ]
                for (_, acc) in explore:
                    training_acc += acc

                batch = [example for (example, _) in explore]

                dynet.renew_cg()
                network.prep_params()

                errors = []
                for example in batch:
                    ## random UNKing ##
                    # Replace non-reserved symbol ids (> 2) with id 0 with
                    # probability unk_params / (unk_params + freq), so the
                    # model learns embeddings for unknown symbols.
                    for (i, uni) in enumerate(example['unigrams']):
                        if uni <= 2:
                            continue

                        u_freq = fm.unigrams_freq_list[uni]
                        drop_prob = unk_params / (unk_params + u_freq)
                        if np.random.random() < drop_prob:
                            example['unigrams'][i] = 0

                    for (i, bi) in enumerate(example['fwd_bigrams']):
                        if bi <= 2:
                            continue

                        b_freq = fm.bigrams_freq_list[bi]
                        drop_prob = unk_params / (unk_params + b_freq)
                        if np.random.random() < drop_prob:
                            example['fwd_bigrams'][i] = 0

                    fwd, back = network.evaluate_recurrent(
                        example['fwd_bigrams'],
                        example['unigrams'],
                    )

                    for (left, right), correct in example['label_data'].items():
                        scores = network.evaluate_labels(
                            fwd, back, left, right)

                        probs = dynet.softmax(scores)
                        # Negative log-likelihood of the correct label.
                        loss = -dynet.log(dynet.pick(probs, correct))
                        errors.append(loss)
                    total_states += len(example['label_data'])

                batch_error = dynet.esum(errors)
                total_cost += batch_error.scalar_value()
                batch_error.backward()
                network.trainer.update()

                mean_cost = total_cost / total_states

                print(
                    '\rBatch {}  Mean Cost {:.4f}  [Train: {}]'.format(
                        b,
                        mean_cost,
                        training_acc,
                    ),
                    end='',
                )
                sys.stdout.flush()

                if ((b + 1) % parse_every) == 0 or b == (num_batched - 1):
                    dev_acc = Segmenter.evaluate_corpus(
                        dev_sentences,
                        fm,
                        network,
                    )
                    print(' [Val: {}]'.format(dev_acc))

                    if dev_acc.fscore() > best_acc.fscore():
                        best_acc = dev_acc
                        network.save(model_save_file)
                        print('    [saved model : {}]'.format(model_save_file))

            current_time = time.time()
            runmins = (current_time - start_time) / 60
            print(' Elapsed time: {:.2f}m'.format(runmins))

        return network
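
The "random UNKing" step in the batch loop is frequency-based unknown-symbol dropout: a symbol with corpus frequency f is replaced by the unknown id with probability z / (z + f), so rare symbols are dropped often and common ones almost never. A standalone sketch of the rule (the helper name is illustrative):

    import numpy as np

    def maybe_unk(symbol_id, freq, z, unk_id=0):
        # Replace symbol_id with unk_id with probability z / (z + freq).
        drop_prob = float(z) / (z + freq)
        return unk_id if np.random.random() < drop_prob else symbol_id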