# Example #1
0
def test_split_dataset_two_sets():
    """Check that a 0.5/0.5 split of four samples gives two samples per part."""
    # Four samples with identical text; only the "Template#" metadata (1-4)
    # differs between them.
    samples = [
        InputSample(
            "Hi there",
            masked=None,
            spans=None,
            create_tags_from_span=False,
            metadata={"Template#": template_number},
        )
        for template_number in range(1, 5)
    ]
    ratios = [0.5, 0.5]

    train, test = split_dataset(samples, ratios)

    assert len(train) == 2
    assert len(test) == 2
# Example #2
0
def test_split_dataset_test_with_smallish_ratio():
    """Check that a tiny third ratio yields an empty third partition.

    The first two partitions are still expected to hold two samples each.
    """
    # Four samples with identical text; only the "Template#" metadata (1-4)
    # differs between them.
    dataset = [
        InputSample(
            "Hi there",
            masked=None,
            spans=None,
            create_tags_from_span=False,
            metadata={"Template#": template_number},
        )
        for template_number in range(1, 5)
    ]
    ratios = [0.5, 0.4999995, 0.0000005]

    train, test, zero = split_dataset(dataset, ratios)

    assert len(train) == 2
    assert len(test) == 2
    assert len(zero) == 0
def test_split_dataset_two_sets():
    """Check that a 0.5/0.5 split of four samples gives two samples per part."""
    # Four samples with identical text, carrying template ids 1 through 4.
    samples = [
        InputSample(
            "Hi there",
            masked=None,
            spans=None,
            create_tags_from_span=False,
            template_id=template_id,
        )
        for template_id in range(1, 5)
    ]
    ratios = [0.5, 0.5]

    train, test = split_dataset(samples, ratios)

    assert len(train) == 2
    assert len(test) == 2
def test_split_dataset_test_with_0_ratio():
    """Check that a ratio of exactly 0 results in a ValueError."""
    # Four samples with identical text, carrying template ids 1 through 4.
    dataset = [
        InputSample(
            "Hi there",
            masked=None,
            spans=None,
            create_tags_from_span=False,
            template_id=template_id,
        )
        for template_id in range(1, 5)
    ]
    ratios = [0.5, 0.5, 0]

    # The call and the unpacking both stay inside the context manager so the
    # expected ValueError is captured wherever it surfaces.
    with pytest.raises(ValueError):
        train, test, zero = split_dataset(dataset, ratios)
def test_split_dataset_four_sets(mock_4_samples):
    """Check a four-way equal split: one sample per part, no template id lost."""
    ratios = [0.25, 0.25, 0.25, 0.25]
    train, test, val, dev = split_dataset(mock_4_samples, ratios)

    assert len(train) == 1
    assert len(test) == 1
    assert len(val) == 1
    assert len(dev) == 1

    # Every original template id must show up in exactly the union of the
    # ids found across the four partitions.
    expected_ids = {1, 2, 3, 4}
    seen_ids = {
        sample.template_id
        for partition in (train, test, dev, val)
        for sample in partition
    }

    assert expected_ids == seen_ids
# Example #6
0
def test_split_dataset_four_sets():
    """Check a four-way equal split: one sample per part, no template lost."""
    # Four samples with identical text; only the "Template#" metadata (1-4)
    # differs between them.
    dataset = [
        InputSample(
            "Hi there",
            masked=None,
            spans=None,
            create_tags_from_span=False,
            metadata={"Template#": template_number},
        )
        for template_number in range(1, 5)
    ]
    ratios = [0.25, 0.25, 0.25, 0.25]

    train, test, val, dev = split_dataset(dataset, ratios)

    assert len(train) == 1
    assert len(test) == 1
    assert len(val) == 1
    assert len(dev) == 1

    # Every original template number must show up in exactly the union of
    # the numbers found across the four partitions.
    expected_templates = {1, 2, 3, 4}
    seen_templates = {
        sample.metadata['Template#']
        for partition in (train, test, dev, val)
        for sample in partition
    }

    assert expected_templates == seen_templates