Exemplo n.º 1
0
            return list(self._read(texts))

    def _read(self, texts):
        """Lazily turn raw text(s) into torchtext examples.

        An argument that is neither a list nor a tuple is treated as a
        one-element collection. ``self._nb_examples`` is reset to zero when
        iteration starts and is incremented just before each example is
        produced.
        """
        self._nb_examples = 0
        batch = texts if isinstance(texts, (list, tuple)) else [texts]
        for sentence in batch:
            self._nb_examples += 1
            yield self.make_torchtext_example(sentence)

    def make_torchtext_example(self, text):
        """Wrap ``text`` in a torchtext ``Example`` under the ``'words'`` key."""
        return torchtext.data.Example.fromdict({'words': text},
                                               self.fields_dict)


if __name__ == '__main__':
    # Manual check when run as a script: hand a few in-memory sentences to
    # the project's quick_test helper.
    from spec.dataset.corpora.test_corpus import quick_test

    sample_sentences = [
        'Lorem ipsum dolor sit amet, consectetur adipisicing elit',
        'tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim',
        'quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea',
        'consequat. Duis aute irure dolor in reprehenderit in voluptate velit',
        'cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat',
        'proident, sunt in culpa qui officia deserunt mollit anim id est.',
    ]
    quick_test(TextCorpus, sample_sentences, lazy=True)
Exemplo n.º 2
0
                    'words': ' '.join(subtree.leaves()),
                    'target': self._get_label(subtree.label())
                }
                if 'target' not in self.fields_dict.keys():
                    del ex['target']
                assert ex.keys() == self.fields_dict.keys()
                examples.append(
                    torchtext.data.Example.fromdict(ex, self.fields_dict))
            return examples
        else:
            ex = {
                'words': ' '.join(tree.leaves()),
                'target': self._get_label(tree.label())
            }
            if 'target' not in self.fields_dict.keys():
                del ex['target']
            assert ex.keys() == self.fields_dict.keys()
            return [torchtext.data.Example.fromdict(ex, self.fields_dict)]

    def _get_label(self, label):
        """Translate a raw tree label through ``self.granularity_map``.

        Raises whatever the mapping raises for an unknown label (``KeyError``
        for a plain dict).
        """
        mapping = self.granularity_map
        return mapping[label]


if __name__ == '__main__':
    # Manual check when run as a script: read the SST training file with
    # whole-sentence examples and the '2' granularity setting.
    from spec.dataset.corpora.test_corpus import quick_test

    quick_test(
        SSTCorpus,
        '../../../data/corpus/sst/train.txt',
        lazy=True,
        subtrees=False,
        granularity='2',
    )
Exemplo n.º 3
0
        self.corpus_path = str(new_file_path)
        self.open(self.corpus_path)
        if self.lazy is True:
            return self
        else:
            return list(self)

    def _read(self, file):
        """Yield one example per non-blank line of ``file``.

        The first whitespace-separated token of a line is taken as the label
        and the remaining tokens, re-joined with single spaces, as the text.
        Lines that contain no tokens are skipped.
        """
        for raw_line in file:
            tokens = raw_line.strip().split()
            if not tokens:
                continue
            label, *words = tokens
            yield self.make_torchtext_example(' '.join(words), label)

    def make_torchtext_example(self, text, label=None):
        """Build a torchtext ``Example`` from a text and an optional label.

        The ``'target'`` entry is only included when ``self.fields_dict``
        declares a ``'target'`` field; the resulting keys must then match the
        declared field names exactly.
        """
        field_names = self.fields_dict.keys()
        ex = {'words': text}
        if 'target' in field_names:
            ex['target'] = label
        assert ex.keys() == field_names
        return torchtext.data.Example.fromdict(ex, self.fields_dict)


if __name__ == '__main__':
    # Manual check when run as a script: read the IMDB test directory lazily.
    from spec.dataset.corpora.test_corpus import quick_test

    quick_test(IMDBCorpus, '../../../data/corpus/imdb/test/', lazy=True)
Exemplo n.º 4
0
    def create_fields_tuples():
        """Return the ``(name, field)`` pairs used by this corpus.

        ``'words'`` is tokenized with NLTK's ``WordPunctTokenizer``;
        ``'target'`` is a plain tags field.
        """
        word_punct = nltk.WordPunctTokenizer()
        words_field = fields.WordsField(tokenize=word_punct.tokenize)
        target_field = fields.TagsField()
        return [('words', words_field), ('target', target_field)]

    def _read(self, file):
        """Yield one example per JSON record found in ``file``.

        Each non-blank line is parsed as a JSON object. Its ``'text'`` entry
        becomes the example text and its ``'stars'`` entry, converted to an
        integer and rendered as a string, becomes the label.

        Blank or whitespace-only lines (for instance a trailing empty line)
        are skipped instead of being handed to ``json.loads``, which would
        raise on them.

        Raises:
            json.JSONDecodeError: if a non-blank line is not valid JSON.
            KeyError: if a record lacks ``'text'`` or ``'stars'``.
        """
        for line in file:
            payload = line.strip()
            if not payload:
                # Nothing to parse on this line.
                continue
            data = json.loads(payload)
            text = data['text']
            # The rating may be stored as a float (e.g. 5.0); normalise it
            # to an integer string.
            label = str(int(data['stars']))
            yield self.make_torchtext_example(text, label)

    def make_torchtext_example(self, text, label=None):
        """Create a torchtext ``Example`` holding ``text`` and maybe ``label``.

        ``label`` is stored under ``'target'`` only if that field exists in
        ``self.fields_dict``; afterwards the example keys are asserted to
        equal the field names.
        """
        declared = self.fields_dict.keys()
        ex = {'words': text}
        if 'target' in declared:
            ex['target'] = label
        assert ex.keys() == declared
        return torchtext.data.Example.fromdict(ex, self.fields_dict)


if __name__ == '__main__':
    # Manual check when run as a script: read the Yelp training reviews lazily.
    from spec.dataset.corpora.test_corpus import quick_test

    quick_test(
        YelpCorpus, '../../../data/corpus/yelp/review_train.json', lazy=True
    )
Exemplo n.º 5
0
class TTSBRCorpus(Corpus):
    """Line-oriented corpus reader with a label token and a text per line."""

    @staticmethod
    def create_fields_tuples():
        """Return the ``(name, field)`` pairs used by this corpus."""
        fields_tuples = [('words', fields.WordsField()),
                         ('target', fields.TagsField())]
        return fields_tuples

    def _read(self, file):
        """Yield one example per non-blank line of ``file``.

        The first whitespace-separated token is the label and the tokens
        from the third one onwards, joined by single spaces, form the text.
        Blank lines are skipped; previously they raised ``IndexError`` when
        the label token was accessed.
        """
        for line in file:
            tokens = line.strip().split()
            if not tokens:
                # Empty or whitespace-only line: there is no label to read.
                continue
            label = tokens[0]
            # NOTE(review): the second token is deliberately left out here,
            # as in the original code -- presumably an id column; confirm
            # against the data format.
            text = ' '.join(tokens[2:])
            yield self.make_torchtext_example(text, label)

    def make_torchtext_example(self, text, label=None):
        """Build a torchtext ``Example`` from a text and an optional label.

        The ``'target'`` entry is dropped when ``self.fields_dict`` has no
        ``'target'`` field; the remaining keys must match the field names.
        """
        ex = {'words': text, 'target': label}
        if 'target' not in self.fields_dict.keys():
            del ex['target']
        assert ex.keys() == self.fields_dict.keys()
        return torchtext.data.Example.fromdict(ex, self.fields_dict)


if __name__ == '__main__':
    # Manual check when run as a script: read the TTSBR training file lazily.
    from spec.dataset.corpora.test_corpus import quick_test

    quick_test(TTSBRCorpus, '../../../data/corpus/ttsbr/trainTT.txt', lazy=True)
Exemplo n.º 6
0
        return fields_tuples

    def _read(self, file):
        """Yield examples for the Business and World entries of an XML file.

        All ``category`` and ``description`` element texts are collected in
        document order and paired positionally. Pairs with a missing text or
        label, or with a label other than ``'Business'``/``'World'``, are
        dropped. Backslashes are removed from the kept texts.
        """
        tree_root = ElementTree.parse(file).getroot()
        labels = [node.text for node in tree_root.iter('category')]
        bodies = [node.text for node in tree_root.iter('description')]
        # business vs world (binary classification)
        kept_labels = ('Business', 'World')
        for body, tag in zip(bodies, labels):
            if body is not None and tag is not None and tag in kept_labels:
                # strip the escaping backslashes from the text
                cleaned = re.sub("\\\\", "", body)
                yield self.make_torchtext_example(cleaned, tag)

    def make_torchtext_example(self, text, label=None):
        """Return a torchtext ``Example`` for ``text`` (and ``label`` if used).

        The label is attached as ``'target'`` only when ``self.fields_dict``
        contains that field; the example keys are then asserted to be equal
        to the field names.
        """
        names = self.fields_dict.keys()
        ex = {'words': text}
        if 'target' in names:
            ex['target'] = label
        assert ex.keys() == names
        return torchtext.data.Example.fromdict(ex, self.fields_dict)


if __name__ == '__main__':
    # Manual check when run as a script: read the AG News test file eagerly
    # (lazy=False).
    from spec.dataset.corpora.test_corpus import quick_test

    quick_test(AGNewsCorpus, '../../../data/corpus/agnews/test.xml', lazy=False)
Exemplo n.º 7
0
from spec.dataset.corpora.snli import SNLICorpus


class MNLICorpus(SNLICorpus):
    """MNLI corpus reader; adds nothing on top of ``SNLICorpus``."""


if __name__ == '__main__':
    # Manual check when run as a script: read the MultiNLI matched dev file
    # lazily.
    from spec.dataset.corpora.test_corpus import quick_test

    quick_test(
        MNLICorpus,
        '../../../data/corpus/mnli/multinli_1.0_dev_matched.jsonl',
        lazy=True,
    )
Exemplo n.º 8
0
                         ('target', fields.TagsField())]
        return fields_tuples

    def _read(self, file):
        """Yield one premise/hypothesis example per labelled JSON line.

        Each non-blank line is parsed as a JSON object with the keys
        ``'gold_label'``, ``'sentence1'`` (premise) and ``'sentence2'``
        (hypothesis). Records whose gold label is ``'-'`` are skipped.

        Blank or whitespace-only lines are skipped as well instead of being
        handed to ``json.loads``, which would raise on them.

        Raises:
            json.JSONDecodeError: if a non-blank line is not valid JSON.
            KeyError: if a record lacks one of the three keys.
        """
        for line in file:
            payload = line.strip()
            if not payload:
                # Nothing to parse on this line.
                continue
            data = json.loads(payload)
            label = data['gold_label']
            premise = data['sentence1']
            hypothesis = data['sentence2']
            if label == '-':
                # These were cases where the annotators disagreed; we'll just
                # skip them. It's like 800 / 500k examples in the train data
                continue
            yield self.make_torchtext_example(premise, hypothesis, label)

    def make_torchtext_example(self, prem, hyp, label):
        """Build a torchtext ``Example`` for a premise/hypothesis pair.

        ``prem`` is stored under ``'words'`` and ``hyp`` under
        ``'words_hyp'``. ``label`` is stored under ``'target'`` only when
        ``self.fields_dict`` declares that field; the example keys are then
        asserted to equal the field names.
        """
        field_names = self.fields_dict.keys()
        ex = {'words': prem, 'words_hyp': hyp}
        if 'target' in field_names:
            ex['target'] = label
        assert ex.keys() == field_names
        return torchtext.data.Example.fromdict(ex, self.fields_dict)


if __name__ == '__main__':
    # Manual check when run as a script: read the SNLI test file lazily.
    from spec.dataset.corpora.test_corpus import quick_test

    quick_test(
        SNLICorpus, '../../../data/corpus/snli/snli_1.0_test.jsonl', lazy=True
    )