def snli(split,
         vocab_file,
         sequence_length=75,
         batch_size=64,
         transform=utils.identity,
         filter_fn=None,
         data_dir=None):
    """Loads the SNLI dataset."""
    tokenize = tokenize_fun(load_tokenizer(vocab_file))

    def _preprocess(d):
        """Applies tokenization."""
        hypothesis = tokenize(d['hypothesis']).flat_values
        premise = tokenize(d['premise']).flat_values
        sep = tokenize(SEP).flat_values
        tokens = tf.concat([hypothesis, sep, premise], axis=0)
        return transform({
            'inputs': tokens,
            'labels': d['label'],
            'index': tf.size(tokens),
        })

    # Load dataset.
    dset = load_tfds('snli', split, _preprocess, filter_fn, data_dir=data_dir)

    # Pad remaining examples to the sequence length.
    dset = padded_batch(dset, batch_size, sequence_length)

    return dset
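
# Usage sketch (not from the original code; the vocab path is a placeholder
# and the helpers above such as load_tfds and padded_batch are assumed to be
# in scope):
#
#   snli_train = snli('train', vocab_file='vocab.subwords')
#   batch = next(iter(snli_train))
#   # batch['inputs']: (64, 75) token ids of hypothesis + SEP + premise,
#   # batch['labels']: (64,), batch['index']: (64,) unpadded lengths.
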
def imdb(split,
         vocab_file,
         sequence_length=1000,
         batch_size=64,
         transform=utils.identity,
         filter_fn=None,
         data_dir=None):
    """Loads the imdb reviews dataset."""
    tokenize = tokenize_fun(load_tokenizer(vocab_file))

    def _preprocess(d):
        """Applies tokenization."""
        tokens = tokenize(d['text']).flat_values
        preprocessed = {
            'inputs': tokens,
            'labels': d['label'],
            'index': tf.size(tokens),
        }
        return transform(preprocessed)

    # Load dataset.
    dset = load_tfds('imdb_reviews',
                     split,
                     _preprocess,
                     filter_fn,
                     data_dir=data_dir)

    # Pad remaining examples to the sequence length.
    dset = padded_batch(dset, batch_size, sequence_length)

    return dset
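
# Usage sketch (placeholder vocab path): IMDB reviews are long, hence the
# default sequence_length of 1000.
#
#   imdb_test = imdb('test', vocab_file='vocab.subwords')
#   # Each batch: 'inputs' (64, 1000) token ids, 'labels' (64,) binary
#   # sentiment, 'index' (64,) unpadded review lengths.
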
def ag_news(split,
            vocab_file,
            sequence_length=100,
            batch_size=64,
            transform_fn=utils.identity,
            filter_fn=None,
            data_dir=None):
    """Loads the ag news dataset."""
    tokenize = tokenize_fun(load_tokenizer(vocab_file))

    def _preprocess(d):
        """Applies tokenization."""
        tokens = tokenize(
            d['description']).flat_values  # Note: we ignore 'title'
        preprocessed = {
            'inputs': tokens,
            'labels': d['label'],
            'index': tf.size(tokens),
        }
        return transform_fn(preprocessed)

    # Load dataset.
    dset = load_tfds('ag_news_subset',
                     split,
                     _preprocess,
                     filter_fn,
                     data_dir=data_dir)

    # Pad remaining examples to the sequence length.
    dset = padded_batch(dset, batch_size, sequence_length)

    return dset
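
# Usage sketch (placeholder vocab path): only the 'description' field is
# tokenized, and the labels are the four ag_news_subset topics
# (World, Sports, Business, Sci/Tech).
#
#   ag_train = ag_news('train', vocab_file='vocab.subwords')
#   # Each batch: 'inputs' (64, 100), 'labels' (64,), 'index' (64,).
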
def paracrawl(language_pair,
              vocab_files,
              sequence_length,
              batch_size=64,
              transform_fn=utils.identity,
              filter_fn=None,
              data_dir=None):
    """Loads a paracrawl translation dataset from TFDS.

  Arguments:
    language_pair: str, e.g. 'ende', specifying both languages.
    vocab_files: List[str], vocab filenames for each language.
  """

    PARACRAWL_LANGUAGE_PAIRS = [
        'enbg', 'encs', 'enda', 'ende', 'enel', 'enes', 'enet', 'enfi', 'enfr',
        'enga', 'enhr', 'enhu', 'enit', 'enlt', 'enlv', 'enmt', 'ennl', 'enpl',
        'enpt', 'enro', 'ensk', 'ensl', 'ensv'
    ]

    if language_pair not in PARACRAWL_LANGUAGE_PAIRS:
        raise ValueError(
            f'language_pair must be one of {PARACRAWL_LANGUAGE_PAIRS}')
    languages = [language_pair[:2], language_pair[2:]]

    tokenizer_list = [
        tokenize_w_punctuation(load_tokenizer(f)) for f in vocab_files
    ]
    tokenizer_dict = dict(zip(languages, tokenizer_list))

    def _preprocess(d):
        tokens = {l: tokenizer_dict[l](d[l]).flat_values for l in languages}
        for l in languages:
            tokens.update({f'{l}_index': tf.size(tokens[l])})
            tokens.update({f'{l}_orig': d[l]})
        return transform_fn(tokens)

    dataset = tfds.load(
        f'para_crawl/{language_pair}',
        split='train',  # para_crawl only has a train split
        data_dir=data_dir)

    dset = pipeline(dataset, preprocess_fun=_preprocess, filter_fn=filter_fn)

    # Filter out examples longer than sequence length.
    for l in languages:
        # Bind `l` via a default argument so each filter checks its own
        # language rather than the loop variable's final value.
        dset = dset.filter(
            lambda d, l=l: d[f'{l}_index'] <= sequence_length)

    # We assume the dataset contains inputs, labels, and an index.
    padded_shapes = {}
    for l in languages:
        padded_shapes[f'{l}_index'] = ()
        padded_shapes[f'{l}_orig'] = ()
        padded_shapes[l] = (sequence_length, )

    # Pad remaining examples to the sequence length.
    dset = dset.padded_batch(batch_size, padded_shapes)

    return dset, tokenizer_dict
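
# Usage sketch (placeholder vocab filenames): unlike the classification
# loaders, paracrawl returns both the batched dataset and the per-language
# tokenizers; the vocab files must be given in the same order as the
# languages in language_pair.
#
#   dset, tokenizers = paracrawl('ende', ['en.vocab', 'de.vocab'],
#                                sequence_length=50)
#   # Each batch has, for each language l in ('en', 'de'): d[l] of shape
#   # (64, 50), d[f'{l}_index'] token counts, and d[f'{l}_orig'] raw strings.
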
def test_tokenizer_fun(vocab):
    """Tests the subword tokenizer."""
    tokenizer = load_tokenizer(vocab.name)
    tokenize = datasets.tokenize_fun(tokenizer)

    actual = list(tokenize("this is a test.").flat_values.numpy())
    expected = [5, 6, 2, 6, 3, 4, 7]
    assert actual == expected
def dbpedia(split,
            num_classes,
            vocab_file,
            sequence_length=1000,
            batch_size=64,
            transform=utils.identity,
            filter_fn=None,
            data_dir=None):
    """Loads the dpedia text classification dataset."""
    tokenize = tokenize_fun(load_tokenizer(vocab_file))

    if data_dir is None:
        raise ValueError('DBPedia dataset requires data_dir to be provided.')

    def _preprocess(d):
        """Applies tokenization, and
    transforms the dbpedia labels according to the
    specified number of classes

    For a given number of classes, the classes with
    labels below that number are kept, and all other classes
    are removed.

    So, e.g., num_classes = 4, would keep classes 0,1,2,3"""
        def relabel(label):
            if label <= num_classes:
                # in DBPedia csv file, labels are
                # given as 1, 2, ...
                return label - 1
            else:
                return tf.constant(-1, dtype=tf.int64)

        tokens = tokenize(d['text']).flat_values
        preprocessed = {
            'inputs': tokens,
            'labels': relabel(d['label']),
            'index': tf.size(tokens),
        }

        return transform(preprocessed)

    # Keep only examples whose relabeled class is valid; note that this
    # overrides any caller-supplied filter_fn.
    filter_fn = lambda x: x['labels'] != -1

    # Load dataset.
    dset = load_csv('dbpedia',
                    split,
                    _preprocess,
                    filter_fn,
                    data_dir=data_dir)

    # Pad remaining examples to the sequence length.
    dset = padded_batch(dset, batch_size, sequence_length)

    return dset
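
# Usage sketch (placeholder paths): DBpedia is read from CSV files under
# data_dir, and num_classes controls how many of the relabeled, zero-based
# classes are kept.
#
#   db_train = dbpedia('train', num_classes=4, vocab_file='vocab.subwords',
#                      data_dir='/path/to/dbpedia_csv')
#   # Each batch: 'inputs' (64, 1000), 'labels' (64,) in {0, 1, 2, 3},
#   # 'index' (64,).
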
def goemotions(split,
               vocab_file,
               sequence_length=50,
               batch_size=64,
               emotions=None,
               transform=utils.identity,
               filter_fn=None,
               data_dir=None):
    """Loads the goemotions dataset."""
    tokenize = tokenize_fun(load_tokenizer(vocab_file))

    if emotions is None:  # Use all emotions.
        emotions = ('admiration', 'amusement', 'anger', 'annoyance',
                    'approval', 'caring', 'confusion', 'curiosity', 'desire',
                    'disappointment', 'disapproval', 'disgust',
                    'embarrassment', 'excitement', 'fear', 'gratitude',
                    'grief', 'joy', 'love', 'nervousness', 'neutral',
                    'optimism', 'pride', 'realization', 'relief', 'remorse',
                    'sadness', 'surprise')

    def _preprocess(d):
        tokens = tokenize(d['comment_text']).flat_values
        index = tf.size(tokens)
        labels = tf.convert_to_tensor([d[e] for e in emotions], dtype=tf.int64)
        preprocessed = {
            'inputs': tokens,
            'labels': labels,
            'index': index,
        }
        return transform(preprocessed)

    # Load dataset.
    dset = load_tfds('goemotions',
                     split,
                     _preprocess,
                     filter_fn,
                     data_dir=data_dir)

    # Pad remaining examples to the sequence length.
    dset = padded_batch(dset,
                        batch_size,
                        sequence_length,
                        label_shape=(len(emotions), ))

    return dset
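
# Usage sketch (placeholder vocab path): labels form a vector with one entry
# per selected emotion, so restricting `emotions` changes the label width.
#
#   go_train = goemotions('train', vocab_file='vocab.subwords',
#                         emotions=('joy', 'sadness', 'anger'))
#   # Each batch: 'inputs' (64, 50), 'labels' (64, 3), 'index' (64,).
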
def snli_sep(split,
             vocab_file,
             hypothesis_length=40,
             premise_length=40,
             batch_size=64,
             transform=utils.identity,
             filter_fn=None,
             data_dir=None):
    """Loads the SNLI dataset, with hypothesis and premise
     separated as two different fields """
    tokenize = tokenize_fun(load_tokenizer(vocab_file))

    def _preprocess(d):
        """Applies tokenization."""
        hypothesis = tokenize(d['hypothesis']).flat_values
        premise = tokenize(d['premise']).flat_values
        return transform({
            'hypothesis': hypothesis,
            'premise': premise,
            'hypothesis_index': tf.size(hypothesis),
            'premise_index': tf.size(premise),
            'labels': d['label'],
        })

    # Load dataset.
    dset = load_tfds('snli', split, _preprocess, filter_fn, data_dir=data_dir)

    # Pad remaining examples to the sequence length.
    field_lengths = {
        'hypothesis_index': hypothesis_length,
        'premise_index': premise_length
    }
    padded_shapes = {
        'hypothesis': (hypothesis_length, ),
        'premise': (premise_length, ),
        'premise_index': (),
        'hypothesis_index': (),
        'labels': ()
    }
    dset = filter_pad_batch(dset, batch_size, field_lengths, padded_shapes)

    return dset
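
# Usage sketch (placeholder vocab path): here the hypothesis and premise are
# padded to separate lengths instead of being joined with a SEP token.
#
#   snli_pairs = snli_sep('validation', vocab_file='vocab.subwords')
#   # Each batch: 'hypothesis' (64, 40), 'premise' (64, 40),
#   # 'hypothesis_index' (64,), 'premise_index' (64,), 'labels' (64,).
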
def amazon(split,
           num_classes,
           vocab_file,
           sequence_length=250,
           batch_size=64,
           transform=utils.identity,
           filter_fn=None,
           data_dir=None):
    """Loads the yelp reviews dataset."""
    tokenize = tokenize_fun(load_tokenizer(vocab_file))

    if data_dir is None:
        raise ValueError('Amazon dataset requires data_dir to be provided.')

    label_conversion = data_utils.sentiment_relabel(num_classes)

    def _preprocess(d):
        """Applies tokenization, and
    transforms the Amazon labels according to the
    specified number of classes"""

        tokens = tokenize(d['text']).flat_values
        preprocessed = {
            'inputs': tokens,
            'labels': label_conversion(d['label']),
            'index': tf.size(tokens),
        }

        return transform(preprocessed)

    # Keep only examples whose relabeled sentiment is valid; note that this
    # overrides any caller-supplied filter_fn.
    filter_fn = lambda x: x['labels'] != -1

    # Load dataset.
    dset = load_csv('amazon', split, _preprocess, filter_fn, data_dir=data_dir)

    # Pad remaining examples to the sequence length.
    dset = padded_batch(dset, batch_size, sequence_length)

    return dset
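
# Usage sketch (placeholder paths): like dbpedia, this reads CSV files from
# data_dir; data_utils.sentiment_relabel presumably maps the raw review
# ratings onto num_classes buckets, and examples mapped to -1 are dropped.
#
#   amazon_train = amazon('train', num_classes=2, vocab_file='vocab.subwords',
#                         data_dir='/path/to/amazon_csv')
#   # Each batch: 'inputs' (64, 250), 'labels' (64,), 'index' (64,).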