Example #1
    def penn_treebank_dataset(self,
                              directory='data/penn-treebank',
                              train=False,
                              dev=False,
                              test=False,
                              train_filename='ptb.train.txt',
                              dev_filename='ptb.valid.txt',
                              test_filename='ptb.test.txt',
                              check_files=['ptb.train.txt'],
                              urls=[
                                  'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt',
                                  'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt',
                                  'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt'
                              ]):

        download_files_maybe_extract(urls=urls, directory=directory, check_files=check_files)

        ret = []
        splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
        splits = [f for (requested, f) in splits if requested]
        for filename in splits:
            full_path = os.path.join(directory, filename)

            # First pass: build the vocabulary and count the total number of tokens.
            with io.open(full_path, encoding='utf-8') as f:
                tokens = 0
                for line in f:
                    words = line.split() + ['<eos>']
                    tokens += len(words)
                    for word in words:
                        self.dictionary.add_word(word)

            # Second pass: encode every token as its index in the vocabulary.
            with io.open(full_path, encoding='utf-8') as f:
                ids = torch.LongTensor(tokens)
                token = 0
                for line in f:
                    words = line.split() + ['<eos>']
                    for word in words:
                        ids[token] = self.dictionary.word2idx[word]
                        token += 1

            # Collect every requested split; the original `return ids` inside this loop
            # silently dropped all splits after the first one requested.
            ret.append(ids)

        if len(ret) == 1:
            return ret[0]
        return tuple(ret)
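This variant is written as a method: it assumes an enclosing class that owns a self.dictionary vocabulary with add_word and word2idx. A minimal sketch of such a wrapper, assuming the usual word-language-model dictionary pattern (the Dictionary and Corpus names here are illustrative, not part of torchnlp):

import io
import os

import torch
from torchnlp.download import download_files_maybe_extract


class Dictionary(object):
    """Minimal word <-> index mapping assumed by the method above."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)
        return self.word2idx[word]


class Corpus(object):
    def __init__(self):
        self.dictionary = Dictionary()

    # The penn_treebank_dataset method from Example #1 would be defined here.


# corpus = Corpus()
# train_ids = corpus.penn_treebank_dataset(train=True)  # 1-D LongTensor of token ids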
Example #2
def trec_dataset(directory='data/trec/',
                 train=False,
                 test=False,
                 train_filename='train_5500.label',
                 test_filename='TREC_10.label',
                 check_files=['train_5500.label'],
                 urls=[
                     'http://cogcomp.org/Data/QA/QC/train_5500.label',
                     'http://cogcomp.org/Data/QA/QC/TREC_10.label'
                 ],
                 fine_grained=False):
    """
    Load the Text REtrieval Conference (TREC) Question Classification dataset.

    The TREC dataset contains 5,500 labeled questions in the training set and another 500 in the
    test set. The dataset has 6 coarse labels and 50 fine-grained (level-2) labels. The average
    sentence length is 10 words and the vocabulary size is about 8,700.

    References:
        * https://nlp.stanford.edu/courses/cs224n/2004/may-steinberg-project.pdf
        * http://cogcomp.org/Data/QA/QC/
        * http://www.aclweb.org/anthology/C02-1150

    **Citation:**
    Xin Li, Dan Roth, Learning Question Classifiers. COLING'02, Aug., 2002.

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        test_filename (str, optional): The filename of the test split.
        check_files (list of str, optional): Check if these files exist; if they do, the download
            was successful.
        urls (list of str, optional): URLs to download.
        fine_grained (bool, optional): Whether to use the fine-grained (level-2) labels.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
        Returns between one and all dataset splits (train, dev and test) depending on if their
        respective boolean argument is ``True``.

    Example:
        >>> from torchnlp.datasets import trec_dataset  # doctest: +SKIP
        >>> train = trec_dataset(train=True)  # doctest: +SKIP
        >>> train[:2]  # doctest: +SKIP
        [{
          'label': 'DESC',
          'text': 'How did serfdom develop in and then leave Russia ?'
        }, {
          'label': 'ENTY',
          'text': 'What films featured the character Popeye Doyle ?'
        }]
    """
    download_files_maybe_extract(urls=urls,
                                 directory=directory,
                                 check_files=check_files)

    ret = []
    splits = [(train, train_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, filename)
        examples = []
        for line in open(full_path, 'rb'):
            # there is one non-ASCII byte: sisterBADBYTEcity; replaced with space
            label, _, text = line.replace(b'\xf0',
                                          b' ').strip().decode().partition(' ')
            label, _, label_fine = label.partition(':')
            if fine_grained:
                examples.append({'label': label_fine, 'text': text})
            else:
                examples.append({'label': label, 'text': text})
        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
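The docstring example above only shows the coarse labels; the fine_grained flag switches to the level-2 labels. A short usage sketch, assuming the download URLs above are still reachable:

from torchnlp.datasets import trec_dataset

train = trec_dataset(train=True)                           # coarse labels, e.g. 'DESC', 'ENTY', 'HUM'
train_fine = trec_dataset(train=True, fine_grained=True)   # level-2 labels, e.g. 'manner', 'cremat'

# Both calls load the same questions; only the 'label' field differs.
print(train[0]['text'])
print(train[0]['label'], train_fine[0]['label'])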
Example #3
def multi30k_dataset(
    directory='data/multi30k/',
    train=False,
    dev=False,
    test=False,
    train_filename='train',
    dev_filename='val',
    test_filename='test',
    check_files=['train.de', 'val.de'],
    urls=[
        'http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz',
        'http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz',
        'http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz'
    ]):
    """
    Load the WMT 2016 machine translation dataset.

    As a translation task, this task consists of translating English sentences that describe an
    image into German, given only the English sentence itself. As training and development data,
    the task provides 29,000 and 1,014 sentence pairs respectively, each containing an English
    source sentence and its German human translation. As test data, it provides a new set of
    1,000 English descriptions.

    Status:
        Host ``www.quest.dcs.shef.ac.uk`` forgot to update their SSL
        certificate; therefore, this dataset does not download securely.

    References:
        * http://www.statmt.org/wmt16/multimodal-task.html
        * http://shannon.cs.illinois.edu/DenotationGraph/

    **Citation**
    ::

        @inproceedings{elliott-EtAl:2016:VL16,
            author    = {{Elliott}, D. and {Frank}, S. and {Sima'an}, K. and {Specia}, L.},
            title     = {Multi30K: Multilingual English-German Image Descriptions},
            booktitle = {Proceedings of the 5th Workshop on Vision and Language},
            year      = {2016},
            pages     = {70--74}
        }

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the dev split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename prefix of the training split.
        dev_filename (str, optional): The filename prefix of the dev split.
        test_filename (str, optional): The filename prefix of the test split.
        check_files (list of str, optional): Check if these files exist; if they do, the download
            was successful.
        urls (list of str, optional): URLs to download.

    Returns:
        :class:`tuple` of :class:`iterable` or :class:`iterable`:
        Returns between one and all dataset splits (train, dev and test) depending on if their
        respective boolean argument is ``True``.

    Example:
        >>> from torchnlp.datasets import multi30k_dataset  # doctest: +SKIP
        >>> train = multi30k_dataset(train=True)  # doctest: +SKIP
        >>> train[:2]  # doctest: +SKIP
        [{
          'en': 'Two young, White males are outside near many bushes.',
          'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'
        }, {
          'en': 'Several men in hard hats are operating a giant pulley system.',
          'de': 'Mehrere Männer mit Schutzhelmen bedienen ein Antriebsradsystem.'
        }]
    """
    download_files_maybe_extract(urls=urls,
                                 directory=directory,
                                 check_files=check_files)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename),
              (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]

    for filename in splits:
        examples = []

        en_path = os.path.join(directory, filename + '.en')
        de_path = os.path.join(directory, filename + '.de')
        en_file = [l.strip() for l in open(en_path, 'r', encoding='utf-8')]
        de_file = [l.strip() for l in open(de_path, 'r', encoding='utf-8')]
        assert len(en_file) == len(de_file)
        for i in range(len(en_file)):
            if en_file[i] != '' and de_file[i] != '':
                examples.append({'en': en_file[i], 'de': de_file[i]})

        ret.append(examples)

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
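When more than one split flag is set, the return value is a tuple in (train, dev, test) order; with a single flag, the bare list is returned. A brief usage sketch, assuming the www.quest.dcs.shef.ac.uk archives above are still downloadable:

from torchnlp.datasets import multi30k_dataset

# Requesting two splits returns a tuple in (train, dev) order.
train, dev = multi30k_dataset(train=True, dev=True)

print(len(train), len(dev))                  # roughly 29,000 and 1,014 sentence pairs
print(train[0]['en'], '->', train[0]['de'])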
Example #4
def conll_dataset(directory='data/',
                  train=False,
                  dev=False,
                  test=False,
                  train_filename='train.txt',
                  dev_filename='dev.txt',
                  test_filename='test.txt',
                  check_files=None,
                  urls=None,
                  tag_scheme=None,
                  column_names=None,
                  use_cols=None):
    """
    Load a dataset in CoNLL format.

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the dev split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the dev split.
        test_filename (str, optional): The filename of the test split.
        check_files (list of str, optional): Check if these files exist; if they do, the download
            was successful.
        urls (list of str, optional): URLs to download.
        tag_scheme (str, optional): The tag scheme of the contained tags (IOB or IOBES).
        column_names (list of str, optional): The names of the columns contained in the dataset
            (defaults to the CoNLL-2003 columns: [text, pos, chunk, entity]).
        use_cols (list of int, optional): The indices of the columns to retain (defaults to all).

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
        Returns between one and all dataset splits (train, dev and test) depending on if their
        respective boolean argument is ``True``.
    """

    urls = urls or []
    check_files = check_files or []

    download_files_maybe_extract(urls=urls,
                                 directory=directory,
                                 check_files=check_files)

    if tag_scheme and tag_scheme.lower() not in ['iob', 'iobes']:
        raise ValueError("Unknown tag scheme '%s'" % tag_scheme)

    column_names = column_names or ['text', 'pos', 'chunk', 'entity']
    use_cols = use_cols or list(range(len(column_names)))

    ret = []
    splits = [(train, train_filename), (dev, dev_filename),
              (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]

    for filename in splits:
        full_path = os.path.join(directory, filename)
        examples = []

        sentences = load_sentences(full_path)

        if tag_scheme:
            update_tag_scheme(sentences, tag_scheme)

        for sentence in sentences:
            columns = list(zip(*sentence))
            examples.append({
                column_names[col_idx]: list(columns[col_idx])
                for col_idx in use_cols
            })

        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
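Since this loader leaves every download detail to the caller, a call needs concrete filenames and URLs. A hedged usage sketch; the filenames and URL below are placeholders for illustration only, not a real hosting location:

# Hypothetical usage: substitute your own CoNLL-formatted files and download location.
train, dev = conll_dataset(
    directory='data/conll2003/',
    train=True,
    dev=True,
    train_filename='eng.train',
    dev_filename='eng.testa',
    check_files=['eng.train'],
    urls=['https://example.com/conll2003.tar.gz'],  # placeholder URL
    tag_scheme='iobes',
    use_cols=[0, 3])  # keep only the 'text' and 'entity' columns

print(train[0]['text'][:5])
print(train[0]['entity'][:5])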
Example #5
def penn_treebank_dataset(
        directory='data/penn-treebank',
        train=False,
        dev=False,
        test=False,
        train_filename='ptb.train.txt',
        dev_filename='ptb.valid.txt',
        test_filename='ptb.test.txt',
        check_files=['ptb.train.txt'],
        urls=[
            'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt',
            'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt',
            'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt'
        ],
        unknown_token=DEFAULT_UNKNOWN_TOKEN,
        eos_token=DEFAULT_EOS_TOKEN):
    """
    Load the Penn Treebank dataset.

    This is the Penn Treebank Project: Release 2 CDROM, featuring a million words of 1989 Wall
    Street Journal material.

    **Reference:** https://catalog.ldc.upenn.edu/ldc99t42

    **Citation:**
    Marcus, Mitchell P., Marcinkiewicz, Mary Ann & Santorini, Beatrice (1993).
    Building a Large Annotated Corpus of English: The Penn Treebank

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the development split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        check_files (list of str, optional): Check if these files exist; if they do, the download
            was successful.
        urls (list of str, optional): URLs to download.
        unknown_token (str, optional): Token to use for unknown words.
        eos_token (str, optional): Token to use at the end of sentences.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
        Returns between one and all dataset splits (train, dev and test) depending on if their
        respective boolean argument is ``True``.

    Example:
        >>> from torchnlp.datasets import penn_treebank_dataset  # doctest: +SKIP
        >>> train = penn_treebank_dataset(train=True)  # doctest: +SKIP
        >>> train[:10]  # doctest: +SKIP
        ['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano',
        'guterman', 'hydro-quebec']
    """
    download_files_maybe_extract(urls=urls, directory=directory, check_files=check_files)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, filename)
        text = []
        with io.open(full_path, encoding='utf-8') as f:
            for line in f:
                text.extend(line.replace('<unk>', unknown_token).split())
                text.append(eos_token)
        ret.append(text)

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
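Unlike Example #1, this version returns a flat list of token strings per split rather than a tensor. A minimal sketch of turning that token stream into index ids; the vocabulary-building loop is an illustration, not part of torchnlp:

import torch
from torchnlp.datasets import penn_treebank_dataset

train, dev, test = penn_treebank_dataset(train=True, dev=True, test=True)

# Build a simple word -> index vocabulary from the training tokens (illustrative only).
vocab = {token: index for index, token in enumerate(sorted(set(train)))}
train_ids = torch.LongTensor([vocab[token] for token in train])

print(len(vocab), train_ids.shape)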