Exemplo n.º 1
0
    def build_corpus(self):
        """Load the train/dev/test treebanks and build word and label vocabularies.

        Reads ``self.train_path``, ``self.dev_path`` and ``self.test_path``,
        parsing one tree per line with ``fromstring``; the training file is
        read with ``ccg.fromfile`` instead when ``self.multitask == 'ccg'``.
        When ``self.multitask == 'spans'`` every tree in all three treebanks
        is replaced by ``tree.convert()``.

        The word list comes from the JSON word -> count mapping stored at
        ``self.vocab_path`` when that path is set, otherwise from the words of
        the training trees. Labels are collected from the training trees
        unless ``self.multitask == 'none'``, in which case the label list is
        empty and ``STOP`` is not added to the word list.

        Results are stored on ``self`` as ``word_vocab``, ``label_vocab``,
        ``train_treebank``, ``dev_treebank`` and ``test_treebank``. Progress
        messages and a statistics summary are printed to stdout.
        """
        def read_trees(path):
            # One tree per line; surrounding whitespace is stripped first.
            with open(path) as handle:
                return [fromstring(raw.strip()) for raw in handle]

        print(f'Loading training trees from `{self.train_path}`...')
        if self.multitask == 'ccg':
            train_trees = ccg.fromfile(self.train_path)
        else:
            train_trees = read_trees(self.train_path)

        print(f'Loading development trees from `{self.dev_path}`...')
        dev_trees = read_trees(self.dev_path)

        print(f'Loading test trees from `{self.test_path}`...')
        test_trees = read_trees(self.test_path)

        if self.multitask == 'spans':
            # The span task needs trees that carry span information.
            train_trees, dev_trees, test_trees = (
                [tree.convert() for tree in bank]
                for bank in (train_trees, dev_trees, test_trees)
            )

        print("Constructing vocabularies...")
        if self.vocab_path is None:
            words = [token for tree in train_trees for token in tree.words()]
        else:
            print(f'Using word vocabulary specified in `{self.vocab_path}`')
            with open(self.vocab_path) as handle:
                counts = json.load(handle)
            # Expand the word -> count mapping back into a flat token list.
            words = []
            for token, freq in counts.items():
                words.extend(token for _ in range(freq))

        if self.multitask == 'none':
            labels = []
            words = [UNK, START] + words
        else:
            labels = [
                label for tree in train_trees for label in tree.labels()
            ]
            words = [UNK, START, STOP] + words

        word_vocab = Vocabulary.fromlist(words, unk_value=UNK)
        label_vocab = Vocabulary.fromlist(labels)

        self.word_vocab = word_vocab
        self.label_vocab = label_vocab

        self.train_treebank = train_trees
        self.dev_treebank = dev_trees
        self.test_treebank = test_trees

        summary = [
            'Corpus statistics:',
            f'Vocab: {word_vocab.size:,} words, {label_vocab.size:,} nonterminals',
            f'Train: {len(train_trees):,} sentences',
            f'Dev: {len(dev_trees):,} sentences',
            f'Test: {len(test_trees):,} sentences',
        ]
        print('\n'.join(summary))
Exemplo n.º 2
0
    def build_corpus(self):
        """Load the treebanks, filter the training set, and build vocabularies.

        Steps, in order:

        1. Read ``self.train_path``, ``self.dev_path`` and ``self.test_path``,
           parsing one tree per line with ``fromstring``.
        2. If ``self.unlabeled`` is set, call ``unlabelize()`` on every
           training tree (dev/test trees are left untouched).
        3. If ``self.model_type == 'crf'``, replace each training tree by
           ``tree.cnf()`` and, when unlabeled, call ``remove_chains()`` on it.
        4. Collect the word list from the JSON word -> count mapping at
           ``self.vocab_path`` when set, otherwise from the training trees.
        5. If ``self.max_sent_len > 0``, drop training trees with more words
           than that limit.
        6. If ``self.min_label_count > 1``, drop training trees that contain
           a label occurring fewer than that many times.
        7. Build the word, label and (for ``*rnng`` models) action
           vocabularies.

        Results are stored on ``self`` as ``word_vocab``, ``label_vocab``,
        ``action_vocab``, ``train_treebank``, ``dev_treebank`` and
        ``test_treebank``. Progress messages and a statistics summary are
        printed to stdout.

        Raises:
            ValueError: if ``self.model_type`` ends with ``'rnng'`` but is
                neither ``'disc-rnng'`` nor ``'gen-rnng'``.
        """
        def read_trees(path):
            # One tree per line; surrounding whitespace is stripped first.
            with open(path) as f:
                return [fromstring(line.strip()) for line in f]

        def fraction(part, whole):
            # Keep the statistics printable when the denominator is empty.
            return part / whole if whole else 0.0

        print(f'Loading training trees from `{self.train_path}`...')
        train_treebank = read_trees(self.train_path)

        print(f'Loading development trees from `{self.dev_path}`...')
        dev_treebank = read_trees(self.dev_path)

        print(f'Loading test trees from `{self.test_path}`...')
        test_treebank = read_trees(self.test_path)

        if self.unlabeled:
            print('Converting trees to unlabeled form...')
            for tree in train_treebank:
                tree.unlabelize()

        if self.model_type == 'crf':
            print('Converting trees to CNF...')
            train_treebank = [tree.cnf() for tree in train_treebank]

            if self.unlabeled:
                for tree in train_treebank:
                    tree.remove_chains()

        print("Constructing vocabularies...")
        # NOTE(review): words are collected before the length/label filters
        # below, so the word vocabulary also covers sentences that are later
        # dropped from the training set -- confirm this is intended.
        if self.vocab_path is not None:
            print(f'Using word vocabulary specified in `{self.vocab_path}`')
            with open(self.vocab_path) as f:
                vocab = json.load(f)
            # Expand the word -> count mapping back into a flat token list.
            words = [
                word for word, count in vocab.items() for _ in range(count)
            ]
        else:
            words = [word for tree in train_treebank for word in tree.words()]

        if self.max_sent_len > 0:
            filtered_treebank = [
                tree for tree in train_treebank
                if len(tree.words()) <= self.max_sent_len
            ]

            print(
                "Using sentences with length <= {}: {:.1%} of all training trees."
                .format(self.max_sent_len,
                        fraction(len(filtered_treebank), len(train_treebank))))

            train_treebank = filtered_treebank

        if self.min_label_count > 1:
            counted_labels = Counter(
                label for tree in train_treebank for label in tree.labels())
            # A set gives O(1) membership tests in the per-tree filter below.
            kept_labels = {
                label for label, count in counted_labels.items()
                if count >= self.min_label_count
            }
            filtered_treebank = [
                tree for tree in train_treebank
                if all(label in kept_labels for label in tree.labels())
            ]

            print(
                "Using labels with count >= {}: {}/{} ({:.1%}) of all labels and {:.1%} of all training trees."
                .format(self.min_label_count, len(kept_labels),
                        len(counted_labels),
                        fraction(len(kept_labels), len(counted_labels)),
                        fraction(len(filtered_treebank), len(train_treebank))))

            train_treebank = filtered_treebank

        labels = [label for tree in train_treebank for label in tree.labels()]

        if self.model_type == 'crf':
            words = [UNK, START, STOP] + words
        else:
            words = [UNK] + words

        word_vocab = Vocabulary.fromlist(words, unk_value=UNK)
        label_vocab = Vocabulary.fromlist(labels)

        if self.model_type.endswith('rnng'):
            # Order is very important! See DiscParser/GenParser classes to know why.
            if self.model_type == 'disc-rnng':
                actions = [SHIFT, REDUCE]
                actions += [NT(label) for label in label_vocab]
            elif self.model_type == 'gen-rnng':
                actions = [REDUCE]
                actions += [NT(label) for label in label_vocab]
                actions += [GEN(word) for word in word_vocab]
            else:
                # Fail with a clear message instead of an unbound local.
                raise ValueError(
                    f'Unsupported RNNG model type: {self.model_type!r}')
            action_vocab = Vocabulary()
            for action in actions:
                action_vocab.add(action)
        else:
            # Non-RNNG models get an empty action vocabulary.
            action_vocab = Vocabulary()

        self.word_vocab = word_vocab
        self.label_vocab = label_vocab
        self.action_vocab = action_vocab

        self.train_treebank = train_treebank
        self.dev_treebank = dev_treebank
        self.test_treebank = test_treebank

        print('\n'.join((
            'Corpus statistics:',
            f'Vocab: {word_vocab.size:,} words, {label_vocab.size:,} nonterminals, {action_vocab.size:,} actions',
            f'Train: {len(train_treebank):,} sentences',
            f'Dev: {len(dev_treebank):,} sentences',
            f'Test: {len(test_treebank):,} sentences')))