Пример #1
0
def test_preprocessing() -> None:
    """Check the 'Tokens' column of the first three preprocessed rows.

    Each of rows 0-2 must match the corresponding entry of the
    reference ``test_tokens``.
    """
    raw = read_test_data()
    processed = data.preprocess_data(raw)

    # Compare row by row against the reference token lists.
    for row in range(3):
        assert list(processed.iloc[row]['Tokens']) == test_tokens[row]
Пример #2
0
def test_word_types() -> None:
    """Check that every reference token is among the collected word types."""
    frame = preprocess_data(read_test_data())
    words = flatten2list(list(frame['Tokens']))
    word_types = data.get_word_types(words)

    # Fail on the first reference token that is not a known word type.
    for tokens in test_tokens:
        for token in tokens:
            assert token in word_types
Пример #3
0
def test_index_uniqueness() -> None:
    """Test that distinct tokens of a sentence map to distinct indexes."""
    frame = preprocess_data(read_test_data())
    words = flatten2list(list(frame['Tokens']))
    word_types = data.get_word_types(words)

    # Only the forward mapping is needed here.
    word2idx, _ = data.build_indexes(word_types, PAD_TOKEN, UNK_TOKEN)

    for sentence in test_tokens:
        distinct_tokens = set(sentence)
        distinct_indexes = {word2idx[token] for token in distinct_tokens}
        # A collision would make the index set smaller than the token set.
        assert len(distinct_indexes) == len(distinct_tokens)
    def __init__(self,
                 data: DataFrameOrFilePath,
                 vocab: Optional[Vocabulary] = None,
                 mode: str = 'sentiment'):
        """Initialise Dataset with data, vocab and mode.

        Args:
            data: Either a file path (``str`` or ``Path``), which is loaded
                with ``pd.read_csv``, or an object that is handed to
                ``preprocess_data`` unchanged.
            vocab: Vocabulary to use. When ``None``, one is built from the
                preprocessed data with ``Vocabulary.build_vocab``.
            mode: Stored as-is on ``self.mode``.
        """
        # Paths are read from disk first; anything else is used directly.
        frame = pd.read_csv(data) if isinstance(data, (str, Path)) else data
        self.data = preprocess_data(frame)

        # The fallback vocabulary is derived from the preprocessed data.
        self.vocab = (
            Vocabulary.build_vocab(self.data) if vocab is None else vocab
        )
        self.mode = mode
Пример #5
0
def test_linear_dataset_sentiment() -> None:
    """Check ids, label and token count of the first two dataset items.

    The dataset is built in ``'sentiment'`` mode from the preprocessed
    test data.
    """
    frame = preprocess_data(read_test_data())
    dataset = MeldLinearTextDataset(frame, mode='sentiment')

    # Items 0 and 1: dialogue 0, utterance id equal to the position,
    # label 2, and as many tokens as the reference token list.
    for position in (0, 1):
        sample = dataset[position]
        assert sample.dialogue_id == 0
        assert sample.utterance_id == position
        assert sample.label.equal(torch.tensor(2))
        assert len(sample.tokens) == len(test_tokens[position])
Пример #6
0
def test_build_indexes() -> None:
    """Test that every token has an index and the mapping round-trips."""
    frame = preprocess_data(read_test_data())
    words = flatten2list(list(frame['Tokens']))
    word_types = data.get_word_types(words)

    word2idx, idx2word = data.build_indexes(word_types, PAD_TOKEN, UNK_TOKEN)

    # Walk all reference tokens in sentence order.
    every_token = (token for sentence in test_tokens for token in sentence)
    for token in every_token:
        assert token in word2idx
        # Looking the index back up must return the original token.
        assert idx2word[word2idx[token]] == token