Example #1
def niid_device(params):
    num_user = params['Trainer']['n_clients']
    dataset_user = params['Dataset']['user']
    assert num_user == dataset_user # the number of clients must exactly match the number of dataset users
    usernames = list(dict(df[4].value_counts()))[:dataset_user] # top `dataset_user` most frequent users (column 4 holds the username)
    df_small = df.loc[df[4].isin(usernames)]
    df_small = df_small.sample(frac=1) # shuffle all the data
    df_train = df_small.iloc[:int(df_small.shape[0] * 0.9), :]
    df_test = df_small.iloc[int(df_small.shape[0] * 0.9):, :]
    text_transform = sequential_transforms(
        str.lower, 
        get_tokenizer("basic_english"),
    )
    counter = Counter(dict(
        # keep the 3000 - 2 most frequent tokens, likely leaving room for the two specials Vocab adds by default
        get_vocab_counter(df_train[5], text_transform).most_common(3000 - 2)
    ))
    vocab = Vocab(
        counter, 
        vectors='glove.6B.300d', 
        vectors_cache='./data/vector_cache/',
    )
    text_transform = sequential_transforms(
        text_transform, 
        vocab_func(vocab), 
        totensor(dtype=torch.long), 
    )
    label_transform = sequential_transforms(totensor(dtype=torch.long))
    data_test = list(zip(df_test[0], df_test[5]))
    test_dataset = TextClassificationDataset(
        data_test, 
        vocab, 
        (label_transform, text_transform),
    )
    # pandas makes the per-user split easy, so no single train dataset is built here
    #data_train = list(zip(df_train[0], df_train[5]))
    #train_dataset = TextClassificationDataset(data_train, vocab, (label_transform, text_transform))
    dataset_split = []
    for username in usernames:
        split_train = df_small.loc[df_small[4] == username]
        split_train = list(zip(split_train[0], split_train[5]))
        dataset_split.append(
            {
                'train': TextClassificationDataset(
                    split_train, 
                    vocab, 
                    (label_transform, text_transform),
                ),
                'test': None, 
            }
        )
    for item in dataset_split: item['vocab'] = vocab
    testset_dict = {
        'train': None,
        'test': test_dataset,
        'vocab': vocab,
    }
    return dataset_split, testset_dict
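`niid_device` reads from a module-level DataFrame `df` whose integer columns hold the label (0), the username (4), and the raw text (5), and it gives each client exactly one user's examples. A hypothetical driver, where the CSV path and parameter values are assumptions rather than part of the original snippet:

# Hypothetical usage sketch for niid_device; the CSV path and param values are assumptions.
import pandas as pd

df = pd.read_csv('./data/sent140.csv', header=None)  # assumed layout: col 0 = label, 4 = username, 5 = text
params = {
    'Trainer': {'n_clients': 10},
    'Dataset': {'user': 10},
}
client_datasets, server_testset = niid_device(params)
print(len(client_datasets), len(server_testset['test']))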
def process_raw(raw_data, tokenizer, vocab):
    raw_data = [(label, text) for (label, text) in raw_data]  # materialise the iterable so it can be indexed repeatedly
    text_trans = sequential_transforms(tokenizer.tokenize, vocab_func(vocab),
                                       to_tensor(dtype=torch.long))
    label_trans = sequential_transforms(to_tensor(dtype=torch.long))

    transforms = (label_trans, text_trans)

    return TextClassificationDataset(raw_data, vocab, transforms)
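`process_raw` expects an iterable of `(label, text)` pairs, a tokenizer object exposing a `.tokenize` method, and a prebuilt vocabulary. A minimal, hypothetical driver; the wrapper class and the vocabulary construction here are assumptions, not part of the original snippet:

# Hypothetical driver for process_raw; wraps a torchtext tokenizer behind a .tokenize method.
from torchtext.data.utils import get_tokenizer
from torchtext.experimental.vocab import build_vocab_from_iterator

class BasicTokenizer:
    def __init__(self):
        self._tok = get_tokenizer('basic_english')

    def tokenize(self, text):
        return self._tok(text)

raw_data = [(1, 'this film is great'), (0, 'this film is bad')]
tokenizer = BasicTokenizer()
vocab = build_vocab_from_iterator(tokenizer.tokenize(text) for _, text in raw_data)
dataset = process_raw(raw_data, tokenizer, vocab)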
def load_dataset(directory, dev_ratio=None, using_vocab=None): # user-defined loader: directory is where to read from; dev_ratio, if set, holds out a validation split
    print(f'loading files in {directory}')
    text = []
    labels = []
    classes = os.listdir(directory) # [neg, pos]; collect the class subdirectories inside the given directory
    for directory_name in classes:
        for filename in tqdm.tqdm(os.listdir(f'{directory}/{directory_name}'), desc=f'loading {directory_name}'):
            with open(f'{directory}/{directory_name}/{filename}', encoding='utf-8') as f:
                tokens = tokenize(f.read(), max_length)
                text.append(tokens)
                labels.append(directory_name)

    if dev_ratio is not None:
        text, dev_text, labels, dev_labels = train_test_split(text, labels, test_size=dev_ratio)

    if using_vocab is None:
        using_vocab = make_vocab(text, vocab_size)

    text_transform = sequential_transforms(
        vocab_func(using_vocab),
        totensor(torch.long)
    )
    label_map = {name: index for index, name in enumerate(classes)}
    print(label_map)
    label_transform = sequential_transforms(
        lambda label: label_map[label],
        totensor(torch.long)
    )

    dataset = TextClassificationDataset(list(zip(labels, text)), using_vocab, (label_transform, text_transform))

    if dev_ratio is not None:
        dev_dataset = TextClassificationDataset(list(zip(dev_labels, dev_text)), using_vocab, (label_transform, text_transform))
        return dataset, dev_dataset
    else:
        return dataset
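`load_dataset` relies on module-level helpers and constants (`tokenize`, `max_length`, `make_vocab`, `vocab_size`) and on a directory containing one subfolder per class. A hedged usage sketch against an IMDB-style layout; the paths are assumptions, and reusing the training vocabulary via `get_vocab()` assumes the experimental `TextClassificationDataset` exposes it:

# Hypothetical usage; expects e.g. ./aclImdb/train/{neg,pos}/*.txt and the module globals above.
train_dataset, dev_dataset = load_dataset('./aclImdb/train', dev_ratio=0.1)
test_dataset = load_dataset('./aclImdb/test', using_vocab=train_dataset.get_vocab())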
def load_imdb(review, score, vocab): 
    print(f'loading imdb text and score data')
    with open(review) as f:
        text = [tokenize(line, max_length) for line in f.readlines()]
    with open(score) as f:
        score = []
        for real_score in f.readlines():
            if int(real_score) >= 6:
                score.append(0) # positive
            else:
                score.append(1) # negative

    text_transform = sequential_transforms(
        vocab_func(vocab),
        totensor(torch.long)
    )
    label_transform = sequential_transforms(
        totensor(torch.long)
    )
    dataset = TextClassificationDataset(list(zip(score, text)), vocab, (label_transform, text_transform))
    return dataset
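`load_imdb` reads one review per line and one integer score per line, mapping scores of 6 or higher to label 0 (positive) and the rest to 1 (negative). A short, hypothetical call; the file paths are placeholders and `vocab` is whatever vocabulary the surrounding code already built:

# Hypothetical usage of load_imdb; paths are placeholders.
imdb_dataset = load_imdb('./data/imdb_reviews.txt', './data/imdb_scores.txt', vocab)
label, token_ids = imdb_dataset[0]  # each item is a (label tensor, token-id tensor) pair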
Example #5
def niid(params):
    num_user = params['Trainer']['n_clients']
    dataset_frac = params['Dataset']['frac']
    s = params['Dataset']['s']
    df_small = df.sample(frac=dataset_frac) # sample & shuffle
    df_train = df_small.iloc[:int(df_small.shape[0] * 0.9), :]
    df_test = df_small.iloc[int(df_small.shape[0] * 0.9):, :]
    text_transform = sequential_transforms(
        str.lower, 
        get_tokenizer("basic_english"),
    )
    counter = Counter(dict(
        get_vocab_counter(df_train[5], text_transform).most_common(3000 - 2)
    ))
    vocab = Vocab(
        counter, 
        vectors='glove.6B.300d', 
        vectors_cache='./data/vector_cache/',
    )
    text_transform = sequential_transforms(
        text_transform, 
        vocab_func(vocab), 
        totensor(dtype=torch.long), 
    )
    label_transform = sequential_transforms(totensor(dtype=torch.long))
    data_test = list(zip(df_test[0], df_test[5]))
    test_dataset = TextClassificationDataset(
        data_test, 
        vocab, 
        (label_transform, text_transform),
    )
    # pandas makes the per-client split easy, so no single train dataset is built here
    #data_train = list(zip(df_train[0], df_train[5]))
    #train_dataset = TextClassificationDataset(data_train, vocab, (label_transform, text_transform))
    df_train_iid = df_train.iloc[:int(s * df_train.shape[0]), :]
    df_train_niid = df_train.iloc[int(s * df_train.shape[0]):, :].sort_values([0])
    p_train_iid = 0
    p_train_niid = 0
    delta_train_iid = df_train_iid.shape[0] // num_user
    delta_train_niid = df_train_niid.shape[0] // num_user
    dataset_split = []
    for userid in range(num_user):
        train_lst = []
        if delta_train_iid > 0:
            train_lst.append(
                df_train_iid[
                    p_train_iid: p_train_iid + delta_train_iid
                ]
            )
        if delta_train_niid > 0:
            train_lst.append(
                df_train_niid[
                    p_train_niid: p_train_niid + delta_train_niid
                ]
            )
        split_train = pd.concat(train_lst)
        split_train = list(zip(split_train[0], split_train[5]))
        dataset_split.append(
            {
                'train': TextClassificationDataset(
                    split_train, 
                    vocab, 
                    (label_transform, text_transform),
                ),
                'test': None, 
            }
        )
        p_train_iid += delta_train_iid
        p_train_niid += delta_train_niid
    for item in dataset_split: item['vocab'] = vocab
    testset_dict = {
        'train': None,
        'test': test_dataset,
        'vocab': vocab,
    }
    return dataset_split, testset_dict
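`niid` mirrors `niid_device` but controls heterogeneity with `s`: a fraction `s` of the training rows is dealt out i.i.d., while the remaining rows are sorted by label (column 0) before being sliced, so each client's shard is skewed toward a narrow label range. A hypothetical call, with parameter values chosen only for illustration:

# Hypothetical usage of niid: with s = 0.2, roughly 80% of each client's shard is label-sorted.
params = {
    'Trainer': {'n_clients': 10},
    'Dataset': {'frac': 0.5, 's': 0.2},
}
client_datasets, server_testset = niid(params)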
Example #6
import torch

from torchtext.experimental.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.experimental.functional import sequential_transforms, vocab_func, totensor
# TextClassificationDataset ships with the experimental datasets (exact module path may vary by torchtext version)
from torchtext.experimental.datasets.text_classification import TextClassificationDataset

# load data from whatever format it's saved in to an iterable of (label, text)
my_data = [('pos', 'this film is great'), ('neg', 'this film is bad'),
           ('neg', 'this film is awful')]

# tokenizer can be any callable function that goes from str -> list[str]
my_tokenizer = get_tokenizer('basic_english')

# build vocabulary from data
my_vocab = build_vocab_from_iterator(
    [my_tokenizer(text) for label, text in my_data])

# how should the label be transformed?
# str -> int -> LongTensor
label_transforms = sequential_transforms(lambda x: 1 if x == 'pos' else 0,
                                         totensor(torch.long))

# how should the text be transformed?
# str -> list[str] -> list[int] -> LongTensor
text_transforms = sequential_transforms(my_tokenizer, vocab_func(my_vocab),
                                        totensor(torch.long))

# tuple the transforms
my_transforms = (label_transforms, text_transforms)

# create TextClassificationDataset with data, vocabulary and transforms
dataset = TextClassificationDataset(my_data, my_vocab, my_transforms)
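Because the text tensors in `dataset` vary in length, batching them needs a collate function that pads to the longest sequence in each batch. A minimal sketch using `DataLoader` and `pad_sequence`; the batch size and padding index 0 are assumptions:

# Minimal batching sketch; assumes padding index 0 and batch size 2.
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_batch(batch):
    # each item is a (label tensor, token-id tensor) pair
    labels = torch.stack([label for label, text in batch])
    texts = pad_sequence([text for label, text in batch],
                         batch_first=True, padding_value=0)
    return labels, texts

loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_batch)
for labels, texts in loader:
    print(labels.shape, texts.shape)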