Example #1
def resplit_datasets(dataset, other_dataset, random_seed=None, cut=None):
    """ Deterministic shuffle and split algorithm.

    Given the same two datasets and the same `random_seed`, the split happens in exactly the same
    way on every call.

    Args:
        dataset (torchnlp.datasets.Dataset)
        other_dataset (torchnlp.datasets.Dataset)
        random_seed (int, optional)
        cut (float, optional): Float between 0 and 1 giving the fraction of rows the first
            dataset gets after the cut; otherwise, the original proportions are kept.
    Returns:
        dataset (torchnlp.datasets.Dataset)
        other_dataset (torchnlp.datasets.Dataset)
    """
    concat = dataset.rows + other_dataset.rows
    # Reference:
    # https://stackoverflow.com/questions/19306976/python-shuffling-with-a-parameter-to-get-the-same-result
    # NOTE: With the same `random_seed`, every call of `resplit_datasets` shuffles the same way
    random.Random(random_seed).shuffle(concat)
    if cut is None:
        return Dataset(concat[:len(dataset)]), Dataset(concat[len(dataset):])
    else:
        cut = max(min(round(len(concat) * cut), len(concat)), 0)
        return Dataset(concat[:cut]), Dataset(concat[cut:])
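A minimal usage sketch for the `cut` parameter above, since the tests below only exercise `random_seed` and `split`. It assumes `Dataset`, `random`, and `resplit_datasets` are in scope as in the surrounding module; the rows are made up:

a = Dataset([{'r': i} for i in range(1, 6)])
b = Dataset([{'r': i} for i in range(6, 11)])
# cut=0.3 keeps round(10 * 0.3) = 3 rows in the first dataset and 7 in the second
a, b = resplit_datasets(a, b, random_seed=123, cut=0.3)
assert len(a) == 3 and len(b) == 7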
Example #2
def test_resplit_datasets():
    a = Dataset([{'r': 1}, {'r': 2}, {'r': 3}, {'r': 4}, {'r': 5}])
    b = Dataset([{'r': 6}, {'r': 7}, {'r': 8}, {'r': 9}, {'r': 10}])
    # Test determinism
    a, b = resplit_datasets(a, b, random_seed=123)
    assert list(a) == [{'r': 9}, {'r': 8}, {'r': 6}, {'r': 10}, {'r': 3}]
    assert list(b) == [{'r': 4}, {'r': 7}, {'r': 2}, {'r': 5}, {'r': 1}]
Example #3
def resplit_datasets(dataset, other_dataset, random_seed=None, split=None):
    """Deterministic shuffle and split algorithm.

    Given the same two datasets and the same ``random_seed``, the split happens in exactly the same
    way on every call.

    Args:
        dataset (lib.datasets.Dataset): First dataset.
        other_dataset (lib.datasets.Dataset): Another dataset.
        random_seed (int, optional): Seed to control the shuffle of both datasets.
        split (float, optional): If defined, the fraction of rows the first dataset gets after
            the split; otherwise, the original proportions are kept.

    Returns:
        :class:`lib.datasets.Dataset`, :class:`lib.datasets.Dataset`: Resplit datasets.
    """
    # Prevent circular dependency
    from torchnlp.datasets import Dataset

    concat = dataset.rows + other_dataset.rows
    shuffle(concat, random_seed=random_seed)
    if split is None:
        return Dataset(concat[:len(dataset)]), Dataset(concat[len(dataset):])
    else:
        split = max(min(round(len(concat) * split), len(concat)), 0)
        return Dataset(concat[:split]), Dataset(concat[split:])
Example #4
def parse_dataset(path,
                  label_to_idx,
                  word_to_idx,
                  pos_target=False,
                  pad_len=None,
                  encoding='latin-1',
                  max_len=100):
    sentences = []
    UNK = 3
    PAD = 1
    target_index = 1 if pos_target else 3
    nr_long = 0
    max_sus = 0
    with open(path, encoding=encoding) as f:

        sample = {'word_ids': [], 'labels': []}
        max_len_token = 0
        for line in f.read().splitlines():
            if line in ['\n', '\r\n', '']:  # end of sequence
                if len(sample['labels']) > 100:
                    nr_long += 1
                if (len(sample['labels']) > 0) and (len(sample['word_ids']) <
                                                    max_len):
                    max_sus = max(max_sus, len(sample['word_ids']))
                    sample['labels'] = torch.LongTensor(sample['labels'])
                    sentences.append(sample)
                sample = {'word_ids': [], 'labels': []}
                continue
            else:
                ls = line.split()
                max_len_token = max(max_len_token, len(ls[4:]))
                word = ls[4:]
                label = ls[target_index]
                if len(word) > 0:
                    word_ids = [
                        word_to_idx[w] if w in word_to_idx.keys() else UNK
                        for w in word
                    ]
                    sample['word_ids'].append(
                        torch.LongTensor(word_ids))  # 3 -> <unk>
                    sample['labels'].append(label_to_idx[label])
                    if len(word_ids) > 20:
                        print(line)

    # pad all BPE encodings to the max length in the dataset
    if pad_len is not None:
        max_len_token = max(pad_len, max_len_token)
    for s in range(len(sentences)):
        sen = sentences[s]
        for i in range(len(sen['word_ids'])):
            sen['word_ids'][i] = pad_tensor(sen['word_ids'][i],
                                            length=max_len_token,
                                            padding_index=PAD)

        # stack word ids back together
        sen['word_ids'] = torch.stack(sen['word_ids'], dim=0).view(-1)
    print('max nr of SUs in sentence: {}'.format(max_sus))
    print('Number of long sentences: {}'.format(nr_long))

    return Dataset(sentences), max_len_token
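A hedged usage sketch for `parse_dataset` above. The file layout is inferred from the indexing in the loop: whitespace-separated columns, the label taken from column 1 (`pos_target=True`) or column 3, sub-word units from column 4 onward, and blank lines between sentences. The toy file, vocabularies, and labels below are made up; `torch`, `pad_tensor`, and `Dataset` are assumed to be provided by the surrounding module:

import tempfile, os

# column layout (inferred): token  POS  chunk  label  sub-unit-1 [sub-unit-2 ...]
toy = ("dogs NNS I-NP O dog s\n"
       "bark VBP I-VP O bark\n"
       "\n"
       "Rex NNP I-NP B-ANIMAL rex\n"
       "\n")
with tempfile.NamedTemporaryFile('w', suffix='.conll', delete=False) as tmp:
    tmp.write(toy)

label_to_idx = {'O': 0, 'B-ANIMAL': 1}
word_to_idx = {'dog': 4, 's': 5, 'bark': 6, 'rex': 7}
dataset, max_len_token = parse_dataset(tmp.name, label_to_idx, word_to_idx)
os.unlink(tmp.name)
assert max_len_token == 2  # "dogs" has two sub-units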
Example #5
def test_dataset_set_row():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    dataset[0] = {'c': 'c'}
    assert dataset['c'] == ['c', None]
    assert dataset['a'] == [None, 'aa']

    dataset[0:2] = [{'d': 'd'}, {'d': 'dd'}]
    assert dataset[0] == {'d': 'd'}
    assert dataset[1] == {'d': 'dd'}

    with pytest.raises(IndexError):
        dataset[2] = {'c': 'c'}
Example #6
def process_dataset(
    docs,
    label_to_idx,
    word_to_idx=None,
    word_counter=None,
    unk="<UNK>",
    pad="<PAD>",
    pad_idx=0,
    unk_idx=1,
    min_freq_word=50,
    label_value=1.0,
    binary_class=True,
):
    """"
		Process list of docs into Pytorch-ready dataset
	"""
    dset = []
    tag_counter = Counter()
    stoi = None

    if min_freq_word:
        word_counter = Counter(
            [w for doc in docs for sent in doc.sentences for w in sent])

    if word_to_idx is None:
        word_to_idx = OrderedDict()
        word_to_idx[pad] = pad_idx
        word_to_idx[unk] = unk_idx
    elif min_freq_word:
        stoi = {
            k: v
            for k, v in word_to_idx.items()
            if (word_counter[k] >= min_freq_word) or (k in [pad, unk])
        }

    print("Loading and converting docs to PyTorch backend...")
    for doc in docs:
        sample, tag_counter = doc_to_sample(
            doc,
            label_to_idx,
            word_to_idx,
            word_counter,
            stoi=stoi,
            min_freq_word=min_freq_word,
            unk=unk,
            tag_counter=tag_counter,
            label_value=label_value,
            binary_class=binary_class,
        )

        dset.append(sample)

    return Dataset(dset), word_to_idx, tag_counter
Example #7
def test_dataset_set_column():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])

    # Regular column update
    dataset['a'] = ['aa', 'aaa']
    assert dataset['a'] == ['aa', 'aaa']

    # Too little
    dataset['b'] = ['b']
    assert dataset['b'] == ['b', None]

    # Too many
    dataset['c'] = ['c', 'cc', 'ccc']
    assert dataset['c'] == ['c', 'cc', 'ccc']

    # Smoke (regression test)
    random.shuffle(dataset)
Example #8
def parse_dataset_laser(path,
                        label_to_idx,
                        word_to_idx,
                        pos_target=False,
                        encoding='latin-1',
                        max_len=100):
    sentences = []
    UNK = 3
    PAD = 1
    target_index = 1 if pos_target else 3

    with open(path, encoding=encoding) as f:

        sample = {'word_ids': [], 'labels': [], 'word_len': []}
        max_len_token = 0
        for line in f.read().splitlines():
            if line in ['\n', '\r\n', '']:  # end of sequence
                if (len(sample['labels']) > 0) and (len(sample['word_ids']) <
                                                    max_len):
                    sample['labels'] = torch.LongTensor(sample['labels'])
                    sample['word_ids'] = torch.LongTensor(sample['word_ids'])
                    sample['word_len'] = torch.LongTensor(sample['word_len'])
                    sentences.append(sample)
                sample = {'word_ids': [], 'labels': [], 'word_len': []}
                continue
            else:
                ls = line.split()
                max_len_token = max(max_len_token, len(ls[4:]))
                word = ls[4:]
                label = ls[target_index]
                if len(word) > 0:
                    word_ids = [
                        word_to_idx[w.lower()]
                        if w.lower() in word_to_idx.keys() else UNK
                        for w in word
                    ]
                    sample['word_ids'].extend(word_ids)  # 3 -> <unk>
                    sample['word_len'].append(len(word_ids))

                    sample['labels'].append(label_to_idx[label])

                    if len(word_ids) > 20:
                        print(line)
    return Dataset(sentences), max_len_token
Example #9
def parse_dataset_muse(path,
                       label_to_idx,
                       word_to_idx=None,
                       pos_target=False,
                       encoding='utf-8',
                       max_len=150):

    target_index = 1 if pos_target else 3
    sentences = []
    if word_to_idx is None:
        word_to_idx = OrderedDict()
        word_num = 0
    else:
        word_num = len(word_to_idx)
    with open(path, encoding=encoding) as f:

        sample = {'word_ids': [], 'labels': []}
        for line in f.read().splitlines():

            if line in ['\n', '\r\n', '']:  # end of sequence
                if len(sample['labels']) > 0 and (len(sample['word_ids']) <
                                                  max_len):
                    sample['word_ids'] = torch.LongTensor(sample['word_ids'])
                    sample['labels'] = torch.LongTensor(sample['labels'])
                    sentences.append(sample)
                sample = {'word_ids': [], 'labels': []}
                continue
            else:
                ls = line.split()
                word = ls[0].lower()
                label = ls[target_index]

                if word not in word_to_idx.keys():
                    word_to_idx[word] = word_num
                    word_num += 1
                sample['word_ids'].append(word_to_idx[word])
                sample['labels'].append(label_to_idx[label])

    return Dataset(sentences), word_to_idx
Example #10
def random_dataset(input_key='input',
                   output_key='output',
                   input_generator=random_sequence,
                   output_generator=random_sequence,
                   input_encoder=WhitespaceEncoder,
                   output_encoder=WhitespaceEncoder,
                   size=random.randint(1, 100)):  # default is evaluated once, at definition time
    """
    Returns:
        (torchnlp.datasets.Dataset) dataset over random data
    """
    rows = []
    for _ in range(size):
        row = {}
        row[input_key] = input_generator()
        row[output_key] = output_generator()
        rows.append(row)
    dataset = Dataset(rows)
    input_encoder = input_encoder(dataset[input_key])
    output_encoder = output_encoder(dataset[output_key])
    for row in dataset:
        row[input_key] = input_encoder.encode(row[input_key])
        row[output_key] = output_encoder.encode(row[output_key])
    return dataset, input_encoder, output_encoder
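A brief usage sketch for `random_dataset` above, assuming `random_sequence` and `WhitespaceEncoder` are importable as in the surrounding test utilities:

dataset, input_encoder, output_encoder = random_dataset(size=5)
assert len(dataset) == 5
# each row now holds the encoded (integer tensor) form of its random sequences
print(dataset[0]['input'], dataset[0]['output'])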
Example #11
def test_resplit_datasets_cut():
    a = Dataset([{'r': 1}, {'r': 2}, {'r': 3}, {'r': 4}, {'r': 5}])
    b = Dataset([{'r': 6}, {'r': 7}, {'r': 8}, {'r': 9}, {'r': 10}])
    a, b = resplit_datasets(a, b, random_seed=123, split=0.3)
    assert len(a) == 3
    assert len(b) == 7
Example #12
def test_dataset_init():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    assert len(dataset) == 2
    assert 'a' in dataset
    assert 'b' in dataset
    assert 'c' not in dataset
Example #13
def test_dataset_concat():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    other_dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    concat = dataset + other_dataset
    assert len(concat) == 4
    assert list(concat) == dataset.rows + other_dataset.rows
Example #14
def test_dataset_equality():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    other_dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    assert dataset == other_dataset
Example #15
def test_dataset_get_row():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    assert dataset[0] == {'a': 'a', 'b': 'b'}
    assert dataset[1] == {'a': 'aa', 'b': 'bb'}
    with pytest.raises(IndexError):
        dataset[2]
Example #16
def test_dataset_get_column():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    assert dataset['a'] == ['a', 'aa']
    assert dataset['b'] == ['b', 'bb']
    with pytest.raises(AttributeError):
        dataset['c']
Example #17
def test_dataset_str():
    dataset = Dataset([{'a': 'a', 'b': 'b'}, {'a': 'aa', 'b': 'bb'}])
    assert '    a   b\n0   a   b\n1  aa  bb' == str(dataset)