def _setup_datasets(dataset_name, root, ngrams, vocab, tokenizer, split_):
    text_transform = []
    if tokenizer is None:
        tokenizer = get_tokenizer("basic_english")
    text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams))
    split = check_default_set(split_, ('train', 'test'), dataset_name)
    raw_datasets = raw.DATASETS[dataset_name](root=root, split=split)
    # Materialize raw text iterable dataset
    raw_data = {
        name: list(raw_dataset)
        for name, raw_dataset in zip(split, raw_datasets)
    }
    if vocab is None:
        if "train" not in split:
            raise TypeError("Must pass a vocab if train is not selected.")
        logger_.info('Building Vocab based on train data')
        vocab = build_vocab(raw_data["train"], text_transform)
    logger_.info('Vocab has %d entries', len(vocab))
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    if dataset_name == 'IMDB':
        label_transform = sequential_transforms(
            lambda x: 1 if x == 'pos' else 0, totensor(dtype=torch.long))
    else:
        label_transform = sequential_transforms(totensor(dtype=torch.long))
    logger_.info('Building datasets for {}'.format(split))
    return wrap_datasets(
        tuple(
            TextClassificationDataset(raw_data[item], vocab,
                                      (label_transform, text_transform))
            for item in split), split_)

def _setup_datasets(dataset_name, root, ngrams, vocab, tokenizer, data_select):
    text_transform = []
    if tokenizer is None:
        tokenizer = get_tokenizer("basic_english")
    text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams))
    data_select = check_default_set(data_select, ('train', 'test'))
    raw_datasets = raw.DATASETS[dataset_name](root=root, data_select=data_select)
    # Materialize raw text iterable dataset
    raw_data = {
        name: list(raw_dataset)
        for name, raw_dataset in zip(data_select, raw_datasets)
    }
    if vocab is None:
        if "train" not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        vocab = build_vocab(raw_data["train"], text_transform)
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    if dataset_name == 'IMDB':
        label_transform = sequential_transforms(
            lambda x: 1 if x == 'pos' else 0, totensor(dtype=torch.long))
    else:
        label_transform = sequential_transforms(totensor(dtype=torch.long))
    return tuple(
        TextClassificationDataset(raw_data[item], vocab,
                                  (label_transform, text_transform))
        for item in data_select)

def niid_device(params):
    num_user = params['Trainer']['n_clients']
    dataset_user = params['Dataset']['user']
    assert num_user == dataset_user  # should be exactly the same

    usernames = list(dict(df[4].value_counts()))[:dataset_user]
    df_small = df.loc[df[4].isin(usernames)]
    df_small = df_small.sample(frac=1)  # shuffle all the data
    df_train = df_small.iloc[:int(df_small.shape[0] * 0.9), :]
    df_test = df_small.iloc[int(df_small.shape[0] * 0.9):, :]

    text_transform = sequential_transforms(
        str.lower,
        get_tokenizer("basic_english"),
    )
    counter = Counter(dict(
        get_vocab_counter(df_train[5], text_transform).most_common(3000 - 2)
    ))
    vocab = Vocab(
        counter,
        vectors='glove.6B.300d',
        vectors_cache='./data/vector_cache/',
    )
    text_transform = sequential_transforms(
        text_transform,
        vocab_func(vocab),
        totensor(dtype=torch.long),
    )
    label_transform = sequential_transforms(totensor(dtype=torch.long))

    data_test = list(zip(df_test[0], df_test[5]))
    test_dataset = TextClassificationDataset(
        data_test,
        vocab,
        (label_transform, text_transform),
    )
    # pandas makes the split easy
    # data_train = list(zip(df_train[0], df_train[5]))
    # train_dataset = TextClassificationDataset(data_train, vocab, (label_transform, text_transform))

    dataset_split = []
    for username in usernames:
        split_train = df_small.loc[df_small[4] == username]
        split_train = list(zip(split_train[0], split_train[5]))
        dataset_split.append(
            {
                'train': TextClassificationDataset(
                    split_train,
                    vocab,
                    (label_transform, text_transform),
                ),
                'test': None,
            }
        )
    for item in dataset_split:
        item['vocab'] = vocab
    testset_dict = {
        'train': None,
        'test': test_dataset,
        'vocab': vocab,
    }
    return dataset_split, testset_dict

def process_raw_data(raw_data, tokenizer, vocab):
    raw_data = [(label, text) for (label, text) in raw_data]
    text_transform = sequential_transforms(tokenizer.tokenize, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    label_transform = sequential_transforms(totensor(dtype=torch.long))
    transforms = (label_transform, text_transform)
    dataset = TextClassificationDataset(raw_data, vocab, transforms)
    return dataset

def _setup_datasets(dataset_name, root, vocabs, data_select):
    data_select = check_default_set(data_select, ('train', 'valid', 'test'))
    raw_iter_tuple = raw.DATASETS[dataset_name](root=root, data_select=data_select)
    raw_data = {}
    for name, raw_iter in zip(data_select, raw_iter_tuple):
        raw_data[name] = list(raw_iter)

    if vocabs is None:
        if "train" not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        vocabs = build_vocab(raw_data["train"])
    else:
        if not isinstance(vocabs, list):
            raise TypeError("vocabs must be an instance of list")

        # Find data that's not None
        notnone_data = None
        for key in raw_data.keys():
            if raw_data[key] is not None:
                notnone_data = raw_data[key]
                break
        if len(vocabs) != len(notnone_data[0]):
            raise ValueError(
                "Number of vocabs must match the number of columns "
                "in the data")

    transformers = [
        sequential_transforms(vocab_func(vocabs[idx]),
                              totensor(dtype=torch.long))
        for idx in range(len(vocabs))
    ]
    return tuple(
        SequenceTaggingDataset(raw_data[item], vocabs, transformers)
        for item in data_select)

def _setup_datasets(dataset_name, root='.data', vocab=None, tokenizer=None,
                    data_select=('train', 'dev')):
    text_transform = []
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')
    text_transform = sequential_transforms(tokenizer)
    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(('train', 'dev'))):
        raise TypeError(
            'Given data selection {} is not supported!'.format(data_select))
    train, dev = raw.DATASETS[dataset_name](root=root)
    raw_data = {
        'train': [item for item in train],
        'dev': [item for item in dev]
    }

    if vocab is None:
        if 'train' not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")

        def apply_transform(data):
            for (_context, _question, _answers, _ans_pos) in data:
                tok_ans = []
                for item in _answers:
                    tok_ans += text_transform(item)
                yield text_transform(_context) + text_transform(_question) + tok_ans

        vocab = build_vocab_from_iterator(apply_transform(raw_data['train']),
                                          len(raw_data['train']))
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    transforms = {
        'context': text_transform,
        'question': text_transform,
        'answers': text_transform,
        'ans_pos': totensor(dtype=torch.long)
    }
    return tuple(
        QuestionAnswerDataset(raw_data[item], vocab, transforms)
        for item in data_select)

def _setup_datasets(dataset_name, root, vocab, tokenizer, split_):
    text_transform = []
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')
    text_transform = sequential_transforms(tokenizer)
    split = _check_default_set(split_, ('train', 'dev'), dataset_name)
    raw_datasets = raw.DATASETS[dataset_name](root=root, split=split)
    raw_data = {
        name: list(raw_dataset)
        for name, raw_dataset in zip(split, raw_datasets)
    }

    if vocab is None:
        if 'train' not in split:
            raise TypeError("Must pass a vocab if train is not selected.")

        def apply_transform(data):
            for (_context, _question, _answers, _ans_pos) in data:
                tok_ans = []
                for item in _answers:
                    tok_ans += text_transform(item)
                yield text_transform(_context) + text_transform(_question) + tok_ans

        logger_.info('Building Vocab based on train data')
        vocab = build_vocab_from_iterator(apply_transform(raw_data['train']),
                                          specials=['<unk>', '<pad>'])
        vocab.set_default_index(vocab['<unk>'])
    logger_.info('Vocab has %d entries', len(vocab))
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    transforms = {
        'context': text_transform,
        'question': text_transform,
        'answers': text_transform,
        'ans_pos': totensor(dtype=torch.long)
    }
    logger_.info('Building datasets for {}'.format(split))
    return _wrap_datasets(
        tuple(
            QuestionAnswerDataset(raw_data[item], vocab, transforms)
            for item in split), split_)

def load_imdb(review, score, vocab):
    print('loading imdb text and score data')
    with open(review) as f:
        text = [tokenize(line, max_length) for line in f.readlines()]
    with open(score) as f:
        score = []
        for real_score in f.readlines():
            if int(real_score) >= 6:
                score.append(0)  # positive
            else:
                score.append(1)  # negative

    text_transform = sequential_transforms(
        vocab_func(vocab),
        totensor(torch.long)
    )
    label_transform = sequential_transforms(
        totensor(torch.long)
    )
    dataset = TextClassificationDataset(list(zip(score, text)), vocab,
                                        (label_transform, text_transform))
    return dataset

def _setup_datasets(
    dataset_name,
    root=".data",
    ngrams=1,
    vocab=None,
    tokenizer=None,
    data_select=("train", "test"),
):
    text_transform = []
    if tokenizer is None:
        tokenizer = get_tokenizer("basic_english")
    text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams))

    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(("train", "test"))):
        raise TypeError(
            "Given data selection {} is not supported!".format(data_select))

    train, test = raw.DATASETS[dataset_name](root=root)
    # Cache raw text iterable dataset
    raw_data = {
        "train": [(label, txt) for (label, txt) in train],
        "test": [(label, txt) for (label, txt) in test],
    }

    if vocab is None:
        if "train" not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        vocab = build_vocab(raw_data["train"], text_transform)
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    if dataset_name == 'IMDB':
        label_transform = sequential_transforms(
            lambda x: 1 if x == 'pos' else 0, totensor(dtype=torch.long))
    else:
        label_transform = sequential_transforms(totensor(dtype=torch.long))
    return tuple(
        TextClassificationDataset(raw_data[item], vocab,
                                  (label_transform, text_transform))
        for item in data_select)

def build_legacy_torchtext_vocab_pipeline(vocab_file):
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator

    def token_iterator(vocab_file):
        f = open(vocab_file, 'r')
        for line in f:
            for token in line:
                yield token

    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    pipeline = sequential_transforms(tokenizer, vocab_func(vocab))
    return pipeline, None, None

def build_torchtext_vocab(vocab_file):
    from torchtext.data.utils import get_tokenizer
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator
    from torchtext.experimental.functional import totensor, vocab_func, sequential_transforms

    def token_iterator(vocab_file):
        f = open(vocab_file, 'r')
        for token in f:
            yield token

    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    pipeline = sequential_transforms(tokenizer, vocab_func(vocab),
                                     totensor(dtype=torch.long))
    return pipeline, None, None

def _setup_datasets(dataset_name, root, vocab, tokenizer, data_select):
    text_transform = []
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')
    text_transform = sequential_transforms(tokenizer)
    data_select = check_default_set(data_select, ('train', 'dev'))
    raw_datasets = raw.DATASETS[dataset_name](root=root, data_select=data_select)
    raw_data = {
        name: list(raw_dataset)
        for name, raw_dataset in zip(data_select, raw_datasets)
    }

    if vocab is None:
        if 'train' not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")

        def apply_transform(data):
            for (_context, _question, _answers, _ans_pos) in data:
                tok_ans = []
                for item in _answers:
                    tok_ans += text_transform(item)
                yield text_transform(_context) + text_transform(_question) + tok_ans

        vocab = build_vocab_from_iterator(apply_transform(raw_data['train']),
                                          len(raw_data['train']))
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    transforms = {
        'context': text_transform,
        'question': text_transform,
        'answers': text_transform,
        'ans_pos': totensor(dtype=torch.long)
    }
    return tuple(
        QuestionAnswerDataset(raw_data[item], vocab, transforms)
        for item in data_select)

def build_legacy_pytext_vocab_pipeline(vocab_file):
    from pytext.data.utils import Vocabulary

    tokenizer = get_tokenizer("basic_english")
    f = open(vocab_file, 'r')
    vocab_counter = Counter([token for line in f for token in line.rstrip()])
    sorted_by_freq_tuples = sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True)
    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")
    pipeline = sequential_transforms(
        tokenizer_func(tokenizer),
        PyTextVocabTransform(Vocabulary(vocab_list, unk_token="<unk>")))
    return pipeline, None, None

def _setup_datasets(dataset_name, root=".data", vocabs=None, data_select=("train", "valid", "test")): if isinstance(data_select, str): data_select = [data_select] if not set(data_select).issubset(set(("train", "valid", "test"))): raise TypeError( "Given data selection {} is not supported!".format(data_select)) train, val, test = raw.DATASETS[dataset_name](root=root) raw_data = { "train": [line for line in train] if train else None, "valid": [line for line in val] if val else None, "test": [line for line in test] if test else None } if vocabs is None: if "train" not in data_select: raise TypeError("Must pass a vocab if train is not selected.") vocabs = build_vocab(raw_data["train"]) else: if not isinstance(vocabs, list): raise TypeError("vocabs must be an instance of list") # Find data that's not None notnone_data = None for key in raw_data.keys(): if raw_data[key] is not None: notnone_data = raw_data[key] break if len(vocabs) != len(notnone_data[0]): raise ValueError( "Number of vocabs must match the number of columns " "in the data") transformers = [ sequential_transforms(vocab_func(vocabs[idx]), totensor(dtype=torch.long)) for idx in range(len(vocabs)) ] datasets = [] for item in data_select: if raw_data[item] is not None: datasets.append( SequenceTaggingDataset(raw_data[item], vocabs, transformers)) return datasets
def load_dataset(directory, dev_ratio=None, using_vocab=None):
    # User-defined loader: `directory` is where the class folders are read from;
    # `dev_ratio`, when given, is the fraction held out as a dev split.
    print(f'loading files in {directory}')
    text = []
    labels = []
    classes = os.listdir(directory)  # [neg, pos]: the class subdirectories inside `directory`
    for directory_name in classes:
        for filename in tqdm.tqdm(os.listdir(f'{directory}/{directory_name}'),
                                  desc=f'loading {directory_name}'):
            with open(f'{directory}/{directory_name}/{filename}', encoding='utf-8') as f:
                tokens = tokenize(f.read(), max_length)
                text.append(tokens)
                labels.append(directory_name)

    if dev_ratio is not None:
        # use the requested dev_ratio for the split
        text, dev_text, labels, dev_labels = train_test_split(text, labels,
                                                              test_size=dev_ratio)

    if using_vocab is None:
        using_vocab = make_vocab(text, vocab_size)

    text_transform = sequential_transforms(
        vocab_func(using_vocab),
        totensor(torch.long)
    )
    label_map = {name: index for index, name in enumerate(classes)}
    print(label_map)
    label_transform = sequential_transforms(
        lambda label: label_map[label],
        totensor(torch.long)
    )
    dataset = TextClassificationDataset(list(zip(labels, text)), using_vocab,
                                        (label_transform, text_transform))
    if dev_ratio is not None:
        dev_dataset = TextClassificationDataset(list(zip(dev_labels, dev_text)),
                                                using_vocab,
                                                (label_transform, text_transform))
        return dataset, dev_dataset
    else:
        return dataset

def build_legacy_batch_torchtext_vocab_pipeline(vocab_file):
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator
    from transforms import TextClassificationPipeline

    def token_iterator(vocab_file):
        f = open(vocab_file, 'r')
        for line in f:
            for token in line:
                yield token

    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    text_pipeline = sequential_transforms(tokenizer, vocab_func(vocab))
    label_pipeline = totensor(dtype=torch.long)
    return TextClassificationPipeline(label_pipeline, text_pipeline), None, None

def niid(params):
    num_user = params['Trainer']['n_clients']
    dataset_frac = params['Dataset']['frac']
    s = params['Dataset']['s']

    df_small = df.sample(frac=dataset_frac)  # sample & shuffle
    df_train = df_small.iloc[:int(df_small.shape[0] * 0.9), :]
    df_test = df_small.iloc[int(df_small.shape[0] * 0.9):, :]

    text_transform = sequential_transforms(
        str.lower,
        get_tokenizer("basic_english"),
    )
    counter = Counter(dict(
        get_vocab_counter(df_train[5], text_transform).most_common(3000 - 2)
    ))
    vocab = Vocab(
        counter,
        vectors='glove.6B.300d',
        vectors_cache='./data/vector_cache/',
    )
    text_transform = sequential_transforms(
        text_transform,
        vocab_func(vocab),
        totensor(dtype=torch.long),
    )
    label_transform = sequential_transforms(totensor(dtype=torch.long))

    data_test = list(zip(df_test[0], df_test[5]))
    test_dataset = TextClassificationDataset(
        data_test,
        vocab,
        (label_transform, text_transform),
    )
    # pandas makes the split easy
    # data_train = list(zip(df_train[0], df_train[5]))
    # train_dataset = TextClassificationDataset(data_train, vocab, (label_transform, text_transform))

    df_train_iid = df_train.iloc[:int(s * df_train.shape[0]), :]
    df_train_niid = df_train.iloc[int(s * df_train.shape[0]):, :].sort_values([0])
    p_train_iid = 0
    p_train_niid = 0
    delta_train_iid = df_train_iid.shape[0] // num_user
    delta_train_niid = df_train_niid.shape[0] // num_user

    dataset_split = []
    for userid in range(num_user):
        train_lst = []
        if delta_train_iid > 0:
            train_lst.append(
                df_train_iid[p_train_iid: p_train_iid + delta_train_iid]
            )
        if delta_train_niid > 0:
            train_lst.append(
                df_train_niid[p_train_niid: p_train_niid + delta_train_niid]
            )
        split_train = pd.concat(train_lst)
        split_train = list(zip(split_train[0], split_train[5]))
        dataset_split.append(
            {
                'train': TextClassificationDataset(
                    split_train,
                    vocab,
                    (label_transform, text_transform),
                ),
                'test': None,
            }
        )
        p_train_iid += delta_train_iid
        p_train_niid += delta_train_niid

    for item in dataset_split:
        item['vocab'] = vocab
    testset_dict = {
        'train': None,
        'test': test_dataset,
        'vocab': vocab,
    }
    return dataset_split, testset_dict

def build_legacy_fasttext_vector_pipeline():
    tokenizer = get_tokenizer("basic_english")
    vector = FastText()

    pipeline = sequential_transforms(tokenizer, vector.get_vecs_by_tokens)
    return pipeline, None, None

import torch
from torchtext.experimental.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.experimental.functional import sequential_transforms, vocab_func, totensor
from torchtext.experimental.datasets.text_classification import TextClassificationDataset

# load data from whatever format it's saved in to an iterable of (label, text)
my_data = [('pos', 'this film is great'),
           ('neg', 'this film is bad'),
           ('neg', 'this film is awful')]

# tokenizer can be any callable function that goes from str -> list[str]
my_tokenizer = get_tokenizer('basic_english')

# build vocabulary from data
my_vocab = build_vocab_from_iterator(
    [my_tokenizer(text) for label, text in my_data])

# how should the label be transformed?
# str -> int -> LongTensor
label_transforms = sequential_transforms(lambda x: 1 if x == 'pos' else 0,
                                         totensor(torch.long))

# how should the text be transformed?
# str -> list[str] -> list[int] -> LongTensor
text_transforms = sequential_transforms(my_tokenizer, vocab_func(my_vocab),
                                        totensor(torch.long))

# tuple the transforms
my_transforms = (label_transforms, text_transforms)

# create TextClassificationDataset with data, vocabulary and transforms
dataset = TextClassificationDataset(my_data, my_vocab, my_transforms)

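# Continuing the example above: a minimal, hedged sketch of how the resulting
# dataset might be consumed. `collate_batch` is a hypothetical helper written
# only for this illustration (it is not part of torchtext); padding with 0 is a
# simplification.
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

label, token_ids = dataset[0]  # each item is (label LongTensor, token-id LongTensor)

def collate_batch(batch):
    # stack the scalar label tensors and pad the variable-length token sequences
    labels = torch.stack([lbl for lbl, _ in batch])
    texts = pad_sequence([txt for _, txt in batch], batch_first=True, padding_value=0)
    return labels, texts

loader = DataLoader(dataset, batch_size=2, shuffle=False, collate_fn=collate_batch)
for labels, texts in loader:
    print(labels.shape, texts.shape)  # e.g. torch.Size([2]) torch.Size([2, max_len])
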
def _setup_datasets(dataset_name, tokenizer=None, root='.data', vocab=None,
                    data_select=('train', 'test', 'valid'), single_line=True):
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')
    text_transform = sequential_transforms(tokenizer)

    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(('train', 'valid', 'test'))):
        raise TypeError(
            'Given data selection {} is not supported!'.format(data_select))

    if not single_line and dataset_name != 'WikiText103':
        raise TypeError('single_line must be True except for WikiText103')
    if dataset_name == 'WMTNewsCrawl':
        train, = raw.DATASETS[dataset_name](root=root, data_select=('train', ))
        if single_line:
            raw_data = {'train': [" ".join([txt for txt in train]), ]}
        else:
            raw_data = {'train': [txt for txt in train]}
    else:
        train, test, valid = raw.DATASETS[dataset_name](
            root=root, data_select=('train', 'test', 'valid'))
        # Cache raw text iterable dataset
        if single_line:
            raw_data = {
                'train': [" ".join([txt for txt in train]), ],
                'valid': [" ".join(txt for txt in valid), ],
                'test': [" ".join(txt for txt in test), ]
            }
        else:
            raw_data = {
                'train': [txt for txt in train],
                'valid': [txt for txt in valid],
                'test': [txt for txt in test]
            }

    if vocab is None:
        if 'train' not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        vocab = build_vocab(raw_data['train'], text_transform)
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    return tuple(
        LanguageModelingDataset(raw_data[item], vocab, text_transform, single_line)
        for item in data_select)