Example No. 1
def test_dataset_split_explicit():
    """ Dataset is split according to given indices
    """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(sentences, dicts, split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)
Example No. 2
def test_dataset_split_parameter():
    """ Dataset is split in the desired ratios
    """
    split_parameter = [0.7, 0.1, 0.2]
    st = SentenceTokenizer(vocab, 30)

    result, result_dicts, _ = st.split_train_val_test(sentences,
                                                      dicts,
                                                      split_parameter,
                                                      extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    assert len(train) == len(sentences) * split_parameter[0]
    assert len(val) == len(sentences) * split_parameter[1]
    assert len(test) == len(sentences) * split_parameter[2]

    assert len(train_dicts) == len(dicts) * split_parameter[0]
    assert len(val_dicts) == len(dicts) * split_parameter[1]
    assert len(test_dicts) == len(dicts) * split_parameter[2]
Example No. 3
def load_benchmark(path, vocab, extend_with=0):
    """ Loads the given benchmark dataset.

        Tokenizes the texts using the provided vocabulary, extending it with
        words from the training dataset if extend_with > 0. Splits them into
        three lists: training, validation and testing (in that order).

        Also calculates the maximum length of the texts and the
        suggested batch_size.

    # Arguments:
        path: Path to the dataset to be loaded.
        vocab: Vocabulary to be used for tokenizing texts.
        extend_with: If > 0, the vocabulary will be extended with up to
            extend_with tokens from the training set before tokenizing.

    # Returns:
        A dictionary with the following fields:
            texts: List of three lists, containing tokenized inputs for
                training, validation and testing (in that order).
            labels: List of three lists, containing labels for training,
                validation and testing (in that order).
            added: Number of tokens added to the vocabulary.
            batch_size: Batch size.
            maxlen: Maximum length of an input.
    """

    # Load the pickled dataset
    with open(path, 'rb') as dataset:
        data = pickle.load(dataset, fix_imports=True)

    # Decode data
    try:
        texts = [str(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    # Extract labels
    labels = [x['label'] for x in data['info']]
    print('Labels:', type(labels), labels)

    batch_size, maxlen = calculate_batchsize_maxlen(texts)

    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(
        texts,
        labels, [data['train_ind'], data['val_ind'], data['test_ind']],
        extend_with=extend_with)
    return {
        'texts': texts,
        'labels': labels,
        'added': added,
        'batch_size': batch_size,
        'maxlen': maxlen
    }
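
For context, here is a minimal sketch of how load_benchmark above might be called. The benchmark pickle path below is a placeholder assumption; the vocabulary path mirrors the one used in the later examples.

import json

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

# Hypothetical dataset path; replace with a real benchmark pickle.
data = load_benchmark('../data/SS-Twitter/raw.pickle', vocab, extend_with=10000)

# 'texts' and 'labels' each hold [train, val, test], in that order.
train_texts, val_texts, test_texts = data['texts']
train_labels, val_labels, test_labels = data['labels']
print(data['added'], data['batch_size'], data['maxlen'])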
Example No. 4
def convert_dataset(filepath, extend_with, vocab):
    print('-- Generating {} '.format(filepath))
    sys.stdout.flush()
    st = SentenceTokenizer(vocab, maxlen)
    tokenized, dicts, _ = st.split_train_val_test(
        texts,
        labels, [data['train_ind'], data['val_ind'], data['test_ind']],
        extend_with=extend_with)
    pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
                         dicts[0], dicts[1], dicts[2])
    # pickle requires the output file to be opened in binary mode
    with open(filepath, 'wb') as f:
        pickle.dump(pick, f)
    cover = coverage(tokenized[2])

    print('     done. Coverage: {}'.format(cover))
Example No. 5
def convert_dataset(filepath, extend_with, vocab):
    print('-- Generating {} '.format(filepath))
    sys.stdout.flush()
    st = SentenceTokenizer(vocab, maxlen)
    tokenized, dicts, _ = st.split_train_val_test(texts,
                                                  labels,
                                                  [data['train_ind'],
                                                   data['val_ind'],
                                                   data['test_ind']],
                                                  extend_with=extend_with)
    pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
                         dicts[0], dicts[1], dicts[2])
    # pickle requires the output file to be opened in binary mode
    with open(filepath, 'wb') as f:
        pickle.dump(pick, f)
    cover = coverage(tokenized[2])

    print('     done. Coverage: {}'.format(cover))
Example No. 6
def test_dataset_split_parameter():
    """ Dataset is split in the desired ratios
    """
    split_parameter = [0.7, 0.1, 0.2]
    st = SentenceTokenizer(vocab, 30)

    result, result_dicts, _ = st.split_train_val_test(sentences, dicts,
                                                      split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    assert len(train) == len(sentences) * split_parameter[0]
    assert len(val) == len(sentences) * split_parameter[1]
    assert len(test) == len(sentences) * split_parameter[2]

    assert len(train_dicts) == len(dicts) * split_parameter[0]
    assert len(val_dicts) == len(dicts) * split_parameter[1]
    assert len(test_dicts) == len(dicts) * split_parameter[2]
Example No. 7
def test_dataset_split_explicit():
    """ Dataset is split according to given indices
    """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(sentences,
                                                          dicts,
                                                          split_parameter,
                                                          extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)
Example No. 8
def load_non_benchmark(data, vocab, extend_with=0):
    # Decode data
    try:
        texts = [x for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    # Extract labels
    labels = [x['label'] for x in data['info']]

    batch_size, maxlen = calculate_batchsize_maxlen(texts)

    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(texts,
                                                   labels,
                                                   extend_with=extend_with)
    return {'texts': texts,
            'labels': labels,
            'added': added,
            'batch_size': batch_size,
            'maxlen': maxlen}
for p in DATASET_PATHS:
    coverage_result = [p]
    print('Calculating coverage for {}'.format(p))
    with open(p, 'rb') as f:
        s = pickle.load(f)

    # Decode data
    try:
        s['texts'] = [str(x) for x in s['texts']]
    except UnicodeDecodeError:
        s['texts'] = [x.decode('utf-8') for x in s['texts']]

    # Own
    st = SentenceTokenizer({}, 30)
    tests, dicts, _ = st.split_train_val_test(
        s['texts'],
        s['info'], [s['train_ind'], s['val_ind'], s['test_ind']],
        extend_with=10000)
    coverage_result.append(coverage(tests[2]))

    # Last
    st = SentenceTokenizer(vocab, 30)
    tests, dicts, _ = st.split_train_val_test(
        s['texts'],
        s['info'], [s['train_ind'], s['val_ind'], s['test_ind']],
        extend_with=0)
    coverage_result.append(coverage(tests[2]))

    # Full
    st = SentenceTokenizer(vocab, 30)
    tests, dicts, _ = st.split_train_val_test(
        s['texts'],
        s['info'], [s['train_ind'], s['val_ind'], s['test_ind']],
        extend_with=10000)
    coverage_result.append(coverage(tests[2]))
Example No. 10
import json

from deepmoji.sentence_tokenizer import SentenceTokenizer

DATASET = [
    u'I am sentence 0',
    u'I am sentence 1',
    u'I am sentence 2',
    u'I am sentence 3',
    u'I am sentence 4',
    u'I am sentence 5',
    u'I am sentence 6',
    u'I am sentence 7',
    u'I am sentence 8',
    u'I am sentence 9 newword',
]

INFO_DICTS = [
    {'label': 'sentence 0'},
    {'label': 'sentence 1'},
    {'label': 'sentence 2'},
    {'label': 'sentence 3'},
    {'label': 'sentence 4'},
    {'label': 'sentence 5'},
    {'label': 'sentence 6'},
    {'label': 'sentence 7'},
    {'label': 'sentence 8'},
    {'label': 'sentence 9'},
]

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)
st = SentenceTokenizer(vocab, 30)

# Split using the default split ratio
print(st.split_train_val_test(DATASET, INFO_DICTS))

# Split explicitly
print(st.split_train_val_test(DATASET,
                              INFO_DICTS,
                              [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
                              extend_with=1))