def imdb_dataset(directory='data/',
                 train=False,
                 test=False,
                 train_directory='train',
                 test_directory='test',
                 extracted_name='aclImdb',
                 check_file='aclImdb/README',
                 url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
                 sentiments=['pos', 'neg']):
    """
    Load the IMDB dataset (Large Movie Review Dataset v1.0).

    This is a dataset for binary sentiment classification containing substantially more data than
    previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for
    training, and 25,000 for testing. There is additional unlabeled data for use as well. Raw text
    and already processed bag of words formats are provided.

    **Reference:** http://ai.stanford.edu/~amaas/data/sentiment/

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_directory (str, optional): The directory of the training split.
        test_directory (str, optional): The directory of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_file (str, optional): Check if this file exists to confirm the download was
            successful.
        url (str, optional): URL of the dataset `tar.gz` file.
        sentiments (list of str, optional): Sentiments to load from the dataset.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset and
        test dataset in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import imdb_dataset
        >>> train = imdb_dataset(train=True)
        >>> train[0:2]
        [{
          'text': 'For a movie that gets no respect there sure are a lot of memorable quotes...',
          'sentiment': 'pos'
        }, {
          'text': 'Bizarre horror movie filled with famous faces but stolen by Cristina Raines...',
          'sentiment': 'pos'
        }]
    """
    download_compressed_directory(file_url=url, directory=directory, check_file=check_file)

    ret = []
    splits = [
        dir_ for (requested, dir_) in [(train, train_directory), (test, test_directory)]
        if requested
    ]
    for split_directory in splits:
        full_path = os.path.join(directory, extracted_name, split_directory)
        examples = []
        for sentiment in sentiments:
            for filename in glob.iglob(os.path.join(full_path, sentiment, '*.txt')):
                with open(filename, 'r', encoding="utf-8") as f:
                    text = f.readline()
                examples.append({
                    'text': text,
                    'sentiment': sentiment,
                })
        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
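# Usage sketch for `imdb_dataset` (illustrative, not part of the library): load both splits and
# tally the sentiment labels. It assumes the download succeeds and that the returned `Dataset`
# supports `len()` and integer indexing, as the docstring examples above suggest.
def _example_imdb_label_counts():
    from collections import Counter

    from torchnlp.datasets import imdb_dataset  # import path taken from the docstring example

    train, test = imdb_dataset(train=True, test=True)
    counts = Counter(train[i]['sentiment'] for i in range(len(train)))
    # Expected: 25000 25000 Counter({'pos': 12500, 'neg': 12500})
    print(len(train), len(test), counts)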
def cache(self, name, cache, url=None):
    if os.path.isfile(name):
        path = name
        path_pt = os.path.join(cache, os.path.basename(name)) + '.pt'
    else:
        path = os.path.join(cache, name)
        path_pt = path + '.pt'

    if not os.path.isfile(path_pt) or self.is_include is not None:
        if url:
            download_compressed_directory(url, cache, name)

        if not os.path.isfile(path):
            raise RuntimeError('no vectors found at {}'.format(path))

        # str call is necessary for Python 2/3 compatibility, since
        # argument must be Python 2 str (Python 3 bytes) or
        # Python 3 str (Python 2 unicode)
        itos, vectors, dim = [], array.array(str('d')), None

        # Try to read the whole file with utf-8 encoding.
        binary_lines = False
        try:
            with io.open(path, encoding="utf8") as f:
                lines = [line for line in f]
        # If there are malformed lines, read in binary mode
        # and manually decode each word from utf-8
        except UnicodeDecodeError:
            logger.warning("Could not read {} as UTF8 file, "
                           "reading file as bytes and skipping "
                           "words with malformed UTF8.".format(path))
            with open(path, 'rb') as f:
                lines = [line for line in f]
            binary_lines = True

        logger.info("Loading vectors from {}".format(path))
        for line in tqdm(lines, total=len(lines)):
            # Explicitly splitting on " " is important, so we don't
            # get rid of Unicode non-breaking spaces in the vectors.
            entries = line.rstrip().split(b" " if binary_lines else " ")
            word, entries = entries[0], entries[1:]
            if dim is None and len(entries) > 1:
                dim = len(entries)
            elif len(entries) == 1:
                logger.warning("Skipping token {} with 1-dimensional "
                               "vector {}; likely a header".format(word, entries))
                continue
            elif dim != len(entries):
                raise RuntimeError(
                    "Vector for token {} has {} dimensions, but previously "
                    "read vectors have {} dimensions. All vectors must have "
                    "the same number of dimensions.".format(word, len(entries), dim))

            if binary_lines:
                try:
                    if isinstance(word, six.binary_type):
                        word = word.decode('utf-8')
                except UnicodeDecodeError:
                    logger.info("Skipping non-UTF8 token {}".format(repr(word)))
                    continue

            if self.is_include is not None and not self.is_include(word):
                continue

            vectors.extend(float(x) for x in entries)
            itos.append(word)

        self.itos = itos
        self.stoi = {word: i for i, word in enumerate(itos)}
        self.vectors = torch.Tensor(vectors).view(-1, dim)
        self.dim = dim
        logger.info('Saving vectors to {}'.format(path_pt))
        torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt)
    else:
        logger.info('Loading vectors from {}'.format(path_pt))
        self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt)
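# Standalone sketch of the parsing step in `cache` above (illustrative; toy data, not a real
# vector file): each line is a token followed by its vector components, and the parsed values
# are packed into one flat buffer and reshaped to (num_tokens, dim).
def _example_parse_vector_lines():
    import array

    import torch

    lines = ['the 0.1 0.2 0.3', 'cat 0.4 0.5 0.6']  # hypothetical 3-dimensional vectors
    itos, flat, dim = [], array.array('d'), None
    for line in lines:
        entries = line.rstrip().split(' ')
        word, values = entries[0], entries[1:]
        dim = dim or len(values)
        flat.extend(float(x) for x in values)
        itos.append(word)
    stoi = {word: i for i, word in enumerate(itos)}
    vectors = torch.Tensor(flat).view(-1, dim)
    print(stoi, vectors.shape)  # {'the': 0, 'cat': 1} torch.Size([2, 3])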
def wmt_dataset(directory='data/wmt16_en_de',
                train=False,
                dev=False,
                test=False,
                train_filename='train.tok.clean.bpe.32000',
                dev_filename='newstest2013.tok.bpe.32000',
                test_filename='newstest2014.tok.bpe.32000',
                check_file='train.tok.clean.bpe.32000.en',
                url='https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8'):
    """
    The Workshop on Machine Translation (WMT) 2014 English-German dataset.

    Initially this dataset was preprocessed by Google Brain. Though this download contains test
    sets from 2015 and 2016, the train set differs slightly from WMT 2015 and 2016 and
    significantly from WMT 2017.

    The provided data is mainly taken from version 7 of the Europarl corpus, which is freely
    available. Note that this is the same data as last year, since Europarl is no longer
    translated across all 23 official European languages. Additional training data is taken from
    the new News Commentary corpus. There are about 50 million words of training data per language
    from the Europarl corpus and 3 million words from the News Commentary corpus.

    A new data resource from 2013 is the Common Crawl corpus which was collected from web sources.
    Each parallel corpus comes with an annotation file that gives the source of each sentence
    pair.

    References:
        * https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/translate_ende.py # noqa: E501
        * http://www.statmt.org/wmt14/translation-task.html

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the dev split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the dev split.
        test_filename (str, optional): The filename of the test split.
        check_file (str, optional): Check if this file exists to confirm the download was
            successful.
        url (str, optional): URL of the dataset `tar.gz` file.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset, dev
        dataset and test dataset in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import wmt_dataset
        >>> train = wmt_dataset(train=True)
        >>> train[:2]
        [{
          'en': 'Res@@ um@@ ption of the session',
          'de': 'Wiederaufnahme der Sitzungsperiode'
        }, {
          'en': 'I declare resumed the session of the European Parliament ad@@ jour@@ ned on...',
          'de': 'Ich erklär@@ e die am Freitag , dem 17. Dezember unterbro@@ ch@@ ene...'
        }]
    """
    download_compressed_directory(
        file_url=url, directory=directory, check_file=check_file, filename='wmt16_en_de.tar.gz')

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        examples = []

        en_path = os.path.join(directory, filename + '.en')
        de_path = os.path.join(directory, filename + '.de')
        en_file = [l.strip() for l in open(en_path, 'r', encoding='utf-8')]
        de_file = [l.strip() for l in open(de_path, 'r', encoding='utf-8')]
        assert len(en_file) == len(de_file)
        for i in range(len(en_file)):
            if en_file[i] != '' and de_file[i] != '':
                examples.append({'en': en_file[i], 'de': de_file[i]})

        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
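# Sketch of the line-pairing step in `wmt_dataset` above (illustrative, using in-memory lines
# instead of the real `.en` / `.de` files): parallel files are aligned by line number and lines
# that are empty on either side are dropped.
def _example_pair_parallel_lines():
    en_lines = ['Res@@ um@@ ption of the session', '', 'Thank you .']    # toy data
    de_lines = ['Wiederaufnahme der Sitzungsperiode', '', 'Danke .']     # toy data
    assert len(en_lines) == len(de_lines)
    examples = [{'en': en, 'de': de}
                for en, de in zip(en_lines, de_lines)
                if en != '' and de != '']
    print(examples)  # two aligned sentence pairs; the empty line is skipped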
def smt_dataset(directory='data/',
                train=False,
                dev=False,
                test=False,
                train_filename='train.txt',
                dev_filename='dev.txt',
                test_filename='test.txt',
                extracted_name='trees',
                check_file='trees/train.txt',
                url='http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip',
                fine_grained=False,
                subtrees=False):
    """
    Load the Stanford Sentiment Treebank dataset.

    Semantic word spaces have been very useful but cannot express the meaning of longer phrases in
    a principled way. Further progress towards understanding compositionality in tasks such as
    sentiment detection requires richer supervised training and evaluation resources and more
    powerful models of composition. To remedy this, we introduce a Sentiment Treebank. It includes
    fine grained sentiment labels for 215,154 phrases in the parse trees of 11,855 sentences and
    presents new challenges for sentiment compositionality.

    **Reference**: https://nlp.stanford.edu/sentiment/index.html

    **Citation:**
    Richard Socher, Alex Perelygin, Jean Y. Wu, Jason Chuang, Christopher D. Manning, Andrew Y. Ng
    and Christopher Potts. Recursive Deep Models for Semantic Compositionality Over a Sentiment
    Treebank

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the development split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_file (str, optional): Check if this file exists to confirm the download was
            successful.
        url (str, optional): URL of the dataset `zip` file.
        subtrees (bool, optional): Whether to include sentiment-tagged subphrases in addition to
            complete examples.
        fine_grained (bool, optional): Whether to use 5-class instead of 3-class labeling.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset, dev
        dataset and test dataset in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import smt_dataset
        >>> train = smt_dataset(train=True)
        >>> train[5]
        {
          'text': "Whether or not you 're enlightened by any of Derrida 's lectures on ...",
          'label': 'positive'
        }
    """
    download_compressed_directory(file_url=url, directory=directory, check_file=check_file)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        examples = []
        with io.open(full_path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if subtrees:
                    examples.extend(parse_tree(line, subtrees=subtrees))
                else:
                    examples.append(parse_tree(line, subtrees=subtrees))
        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
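# `parse_tree` is defined elsewhere in the library and is not shown here. The sketch below is a
# hypothetical stand-in illustrating the expected input/output for a single non-subtree example:
# an SST-style line such as '(3 (2 Effective) (2 (2 but) (2 too-tepid)))' carries a 0-4 root
# label, and its leaves form the sentence text. The label names follow the usual SST convention
# and are an assumption, not necessarily the library's exact mapping.
def _example_parse_sst_line(line, fine_grained=False):
    import re

    root_label = int(line[1])  # the digit right after the opening parenthesis
    tokens = re.findall(r'\s([^()\s]+)\)', line)  # leaf tokens each directly precede a ')'
    if fine_grained:
        label = ['very negative', 'negative', 'neutral', 'positive', 'very positive'][root_label]
    else:
        label = ['negative', 'negative', 'neutral', 'positive', 'positive'][root_label]
    return {'text': ' '.join(tokens), 'label': label}


# _example_parse_sst_line('(3 (2 Effective) (2 (2 but) (2 too-tepid)))')
# -> {'text': 'Effective but too-tepid', 'label': 'positive'}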
def wikitext_2_dataset(directory='data/',
                       train=False,
                       dev=False,
                       test=False,
                       train_filename='wiki.train.tokens',
                       dev_filename='wiki.valid.tokens',
                       test_filename='wiki.test.tokens',
                       extracted_name='wikitext-2',
                       check_file='wikitext-2/wiki.train.tokens',
                       url='https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip'):
    """
    Load the WikiText-2 dataset.

    The WikiText language modeling dataset is a collection of over 100 million tokens extracted
    from the set of verified Good and Featured articles on Wikipedia. The dataset is available
    under the Creative Commons Attribution-ShareAlike License.

    **Reference:**
    https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the development split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_file (str, optional): Check if this file exists to confirm the download was
            successful.
        url (str, optional): URL of the dataset `zip` file.

    Returns:
        :class:`tuple` of :class:`list` of :class:`str`: Tuple with the training tokens, dev
        tokens and test tokens in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import wikitext_2_dataset
        >>> train = wikitext_2_dataset(train=True)
        >>> train[:10]
        ['</s>', '=', 'Valkyria', 'Chronicles', 'III', '=', '</s>', '</s>', 'Senjō', 'no']
    """
    download_compressed_directory(file_url=url, directory=directory, check_file=check_file)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        text = []
        with io.open(full_path, encoding='utf-8') as f:
            for line in f:
                text.extend(line.replace('<unk>', UNKNOWN_TOKEN).split())
                text.append(EOS_TOKEN)
        ret.append(text)

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
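# One common way to consume the flat token list returned above (an illustrative sketch, not
# library code): map tokens to integer ids and fold the stream into `batch_size` contiguous
# columns, the usual layout for truncated-BPTT language modeling.
def _example_batchify(tokens, batch_size=2):
    import torch

    stoi = {token: i for i, token in enumerate(sorted(set(tokens)))}
    ids = torch.tensor([stoi[token] for token in tokens], dtype=torch.long)
    n_steps = ids.size(0) // batch_size
    ids = ids[:n_steps * batch_size]     # trim tokens that do not fit evenly
    return ids.view(batch_size, -1).t()  # shape: (n_steps, batch_size)


# _example_batchify(['</s>', '=', 'Valkyria', 'Chronicles', 'III', '=', '</s>', '</s>'])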
def snli_dataset(directory='data/',
                 train=False,
                 dev=False,
                 test=False,
                 train_filename='snli_1.0_train.jsonl',
                 dev_filename='snli_1.0_dev.jsonl',
                 test_filename='snli_1.0_test.jsonl',
                 extracted_name='snli_1.0',
                 check_file='snli_1.0/snli_1.0_train.jsonl',
                 url='http://nlp.stanford.edu/projects/snli/snli_1.0.zip'):
    """
    Load the Stanford Natural Language Inference (SNLI) dataset.

    The SNLI corpus (version 1.0) is a collection of 570k human-written English sentence pairs
    manually labeled for balanced classification with the labels entailment, contradiction, and
    neutral, supporting the task of natural language inference (NLI), also known as recognizing
    textual entailment (RTE). We aim for it to serve both as a benchmark for evaluating
    representational systems for text, especially including those induced by representation
    learning methods, as well as a resource for developing NLP models of any kind.

    **Reference:** https://nlp.stanford.edu/projects/snli/

    **Citation:**
    Samuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning. 2015. A large
    annotated corpus for learning natural language inference. In Proceedings of the 2015
    Conference on Empirical Methods in Natural Language Processing (EMNLP).

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the development split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_file (str, optional): Check if this file exists to confirm the download was
            successful.
        url (str, optional): URL of the dataset `zip` file.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset, dev
        dataset and test dataset in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import snli_dataset
        >>> train = snli_dataset(train=True)
        >>> train[0]
        {
          'premise': 'Kids are on a amusement ride.',
          'hypothesis': 'A car is broke down on the side of the road.',
          'label': 'contradiction',
          'premise_transitions': ['shift', 'shift', 'shift', 'shift', 'shift', 'shift', ...],
          'hypothesis_transitions': ['shift', 'shift', 'shift', 'shift', 'shift', 'shift', ...],
        }
    """
    download_compressed_directory(file_url=url, directory=directory, check_file=check_file)

    def get_transitions(parse):
        # The binary parse is a whitespace-delimited string such as '( ( Kids ... ) )'.
        return ['reduce' if t == ')' else 'shift' for t in parse.split() if t != '(']

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        examples = []
        with io.open(full_path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                line = json.loads(line)
                examples.append({
                    'premise': line['sentence1'],
                    'hypothesis': line['sentence2'],
                    'label': line['gold_label'],
                    'premise_transitions': get_transitions(line['sentence1_binary_parse']),
                    'hypothesis_transitions': get_transitions(line['sentence2_binary_parse'])
                })
        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
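# Quick illustration of the shift/reduce conversion used in `snli_dataset` above (toy parse
# string, not real SNLI data): every token becomes 'shift', every ')' becomes 'reduce', and '('
# is dropped.
def _example_get_transitions():
    parse = '( ( The cat ) ( sat down ) )'  # hypothetical binary parse
    transitions = ['reduce' if t == ')' else 'shift' for t in parse.split() if t != '(']
    print(transitions)
    # ['shift', 'shift', 'reduce', 'shift', 'shift', 'reduce', 'reduce']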
def iwslt_dataset(
        directory='data/iwslt/',
        train=False,
        dev=False,
        test=False,
        language_extensions=['en', 'de'],
        train_filename='{source}-{target}/train.{source}-{target}.{lang}',
        dev_filename='{source}-{target}/IWSLT16.TED.tst2013.{source}-{target}.{lang}',
        test_filename='{source}-{target}/IWSLT16.TED.tst2014.{source}-{target}.{lang}',
        check_file='{source}-{target}/train.tags.{source}-{target}.{source}',
        url='https://wit3.fbk.eu/archive/2016-01/texts/{source}/{target}/{source}-{target}.tgz'):
    """
    Load the International Workshop on Spoken Language Translation (IWSLT) 2017 translation
    dataset.

    In-domain training, development and evaluation sets were supplied through the website of the
    WIT3 project, while out-of-domain training data were linked in the workshop's website. With
    respect to edition 2016 of the evaluation campaign, some of the talks added to the TED
    repository during the last year have been used to define the evaluation sets (tst2017), while
    the remaining new talks have been included in the training sets.

    The English data that participants were asked to recognize and translate consists in part of
    TED talks as in the years before, and in part of real-life lectures and talks that have been
    mainly recorded in lecture halls at KIT and Carnegie Mellon University. TED talks are
    challenging due to their variety in topics, but are very benign as they are very thoroughly
    rehearsed and planned, leading to language that is easy to recognize and translate.

    References:
        * http://workshop2017.iwslt.org/downloads/iwslt2017_proceeding_v2.pdf
        * http://workshop2017.iwslt.org/

    **Citation:**
    M. Cettolo, C. Girardi, and M. Federico. 2012. WIT3: Web Inventory of Transcribed and
    Translated Talks. In Proc. of EAMT, pp. 261-268, Trento, Italy.

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the dev split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        language_extensions (:class:`list` of :class:`str`): Two language extensions
            ['en'|'de'|'it'|'nl'|'ro'] to load.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the dev split.
        test_filename (str, optional): The filename of the test split.
        check_file (str, optional): Check if this file exists to confirm the download was
            successful.
        url (str, optional): URL of the dataset file.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset, dev
        dataset and test dataset in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import iwslt_dataset
        >>> train = iwslt_dataset(train=True)
        >>> train[:2]
        [{
          'en': "David Gallo: This is Bill Lange. I'm Dave Gallo.",
          'de': 'David Gallo: Das ist Bill Lange. Ich bin Dave Gallo.'
        }, {
          'en': "And we're going to tell you some stories from the sea here in video.",
          'de': 'Wir werden Ihnen einige Geschichten über das Meer in Videoform erzählen.'
        }]
    """
    if len(language_extensions) != 2:
        raise ValueError("`language_extensions` must be two language extensions "
                         "['en'|'de'|'it'|'nl'|'ro'] to load.")

    # Format Filenames
    source, target = tuple(language_extensions)
    check_file = check_file.format(source=source, target=target)
    url = url.format(source=source, target=target)

    download_compressed_directory(file_url=url, directory=directory, check_file=check_file)

    iwslt_clean(os.path.join(directory, '{source}-{target}'.format(source=source, target=target)))

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        examples = []

        for extension in language_extensions:
            path = os.path.join(directory,
                                filename.format(lang=extension, source=source, target=target))
            with open(path, 'r', encoding='utf-8') as f:
                language_specific_examples = [l.strip() for l in f]

            if len(examples) == 0:
                examples = [{} for _ in range(len(language_specific_examples))]
            for i, example in enumerate(language_specific_examples):
                examples[i][extension] = example

        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
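# Illustration of how the filename templates in `iwslt_dataset` above expand for a given language
# pair (pure string formatting, no download involved).
def _example_iwslt_paths():
    source, target = 'en', 'de'
    train_filename = '{source}-{target}/train.{source}-{target}.{lang}'
    for lang in (source, target):
        print(train_filename.format(source=source, target=target, lang=lang))
    # en-de/train.en-de.en
    # en-de/train.en-de.de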
def ud_pos_dataset(directory='data/',
                   train=False,
                   dev=False,
                   test=False,
                   train_filename='en-ud-tag.v2.train.txt',
                   dev_filename='en-ud-tag.v2.dev.txt',
                   test_filename='en-ud-tag.v2.test.txt',
                   extracted_name='en-ud-v2',
                   check_file='en-ud-v2/en-ud-tag.v2.train.txt',
                   url='https://bitbucket.org/sivareddyg/public/downloads/en-ud-v2.zip'):
    """
    Load the Universal Dependencies - English Dependency Treebank dataset.

    Corpus of sentences annotated using Universal Dependencies annotation. The corpus comprises
    254,830 words and 16,622 sentences, taken from various web media including weblogs,
    newsgroups, emails, reviews, and Yahoo! answers.

    References:
        * http://universaldependencies.org/
        * https://github.com/UniversalDependencies/UD_English

    **Citation:**
    Natalia Silveira and Timothy Dozat and Marie-Catherine de Marneffe and Samuel Bowman and
    Miriam Connor and John Bauer and Christopher D. Manning (2014). A Gold Standard Dependency
    Corpus for English

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): Whether to load the training split of the dataset.
        dev (bool, optional): Whether to load the development split of the dataset.
        test (bool, optional): Whether to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_file (str, optional): Check if this file exists to confirm the download was
            successful.
        url (str, optional): URL of the dataset `zip` file.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset`: Tuple with the training dataset, dev
        dataset and test dataset in order if their respective boolean argument is true.

    Example:
        >>> from torchnlp.datasets import ud_pos_dataset
        >>> train = ud_pos_dataset(train=True)
        >>> train[17]  # Sentence at index 17 is shortish
        {
          'tokens': ['Guerrillas', 'killed', 'an', 'engineer', ',', 'Asi', 'Ali', ',', 'from',
                     'Tikrit', '.'],
          'ud_tags': ['NOUN', 'VERB', 'DET', 'NOUN', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'ADP',
                      'PROPN', 'PUNCT'],
          'ptb_tags': ['NNS', 'VBD', 'DT', 'NN', ',', 'NNP', 'NNP', ',', 'IN', 'NNP', '.']
        }
    """
    download_compressed_directory(file_url=url, directory=directory, check_file=check_file)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        examples = []
        with io.open(full_path, encoding='utf-8') as f:
            sentence = {'tokens': [], 'ud_tags': [], 'ptb_tags': []}
            for line in f:
                line = line.strip()
                if line == '' and len(sentence['tokens']) > 0:
                    examples.append(sentence)
                    sentence = {'tokens': [], 'ud_tags': [], 'ptb_tags': []}
                elif line != '':
                    token, ud_tag, ptb_tag = tuple(line.split('\t'))
                    sentence['tokens'].append(token)
                    sentence['ud_tags'].append(ud_tag)
                    sentence['ptb_tags'].append(ptb_tag)
        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
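# Standalone sketch of the line format consumed by `ud_pos_dataset` above (toy lines, not real
# corpus data): one tab-separated `token<TAB>UD tag<TAB>PTB tag` triple per line, with a blank
# line ending each sentence.
def _example_parse_tagged_lines():
    lines = ['Dogs\tNOUN\tNNS', 'bark\tVERB\tVBP', '.\tPUNCT\t.', '']
    examples = []
    sentence = {'tokens': [], 'ud_tags': [], 'ptb_tags': []}
    for line in lines:
        line = line.strip()
        if line == '' and len(sentence['tokens']) > 0:
            examples.append(sentence)
            sentence = {'tokens': [], 'ud_tags': [], 'ptb_tags': []}
        elif line != '':
            token, ud_tag, ptb_tag = line.split('\t')
            sentence['tokens'].append(token)
            sentence['ud_tags'].append(ud_tag)
            sentence['ptb_tags'].append(ptb_tag)
    print(examples)
    # [{'tokens': ['Dogs', 'bark', '.'], 'ud_tags': ['NOUN', 'VERB', 'PUNCT'],
    #   'ptb_tags': ['NNS', 'VBP', '.']}]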