def test_init(self):
    # basic init
    label_field = data.LabelField()
    assert label_field.sequential is False
    assert label_field.unk_token is None

    # init with preset fields: LabelField forces sequential=False and unk_token=None
    label_field = data.LabelField(sequential=True, unk_token="<unk>")
    assert label_field.sequential is False
    assert label_field.unk_token is None
def load_dataset(test_sen=None):
    """
    tokenizer : Breaks sentences into a list of words. If sequential=False,
        no tokenization is applied.
    Field : A class that stores information about the way of preprocessing.
    fix_length : An important property of TorchText is that we can let the
        input be variable length, and TorchText will dynamically pad each
        sequence to the longest sequence in that batch. Here we instead set
        fix_length, which pads every sequence to a fixed length (70 in the
        Field below).
    build_vocab : Builds a vocabulary mapping every unique word in train_data
        to an index, then uses the GloVe embeddings to map each index to the
        corresponding word vector.
    vocab.vectors : Returns a torch tensor of shape (vocab_size, embedding_dim)
        containing the pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar
        lengths together to minimize the amount of padding needed.
    """
    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True,
                      include_lengths=True, batch_first=True, fix_length=70)
    LABEL = data.LabelField()
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    # Further split train_data to create new train_data and validation_data
    train_data, valid_data = train_data.split()
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=32,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)

    '''Alternatively, we can also use the default configurations'''
    # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    vocab_size = len(TEXT.vocab)

    # print(TEXT.vocab.stoi["<eos>"])
    # print(TEXT.vocab.stoi["<sos>"])
    # exit(0)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
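
# A minimal consumption sketch for the iterators returned above (illustrative
# only, not part of the original snippet; it assumes load_dataset() is called
# as shown). Because TEXT uses include_lengths=True and batch_first=True,
# batch.text is a (token_ids, lengths) tuple with token_ids of shape
# (batch_size, fix_length).
TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_dataset()
for batch in train_iter:
    token_ids, lengths = batch.text   # LongTensor (32, 70), LongTensor (32,)
    labels = batch.label              # LongTensor (32,)
    break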
def test_vocab_size(self):
    # Set up fields
    question_field = data.Field(sequential=True)
    label_field = data.LabelField()

    # Copied from test_build_vocab with minor changes

    # Write TSV dataset and construct a Dataset
    self.write_test_ppid_dataset(data_format="tsv")
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", label_field)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)

    # Skipping json dataset as we can rely on the original build vocab test

    label_field.build_vocab(tsv_dataset)
    assert label_field.vocab.freqs == Counter({'1': 2, '0': 1})
    expected_stoi = {'1': 0, '0': 1}  # No <unk>
    assert dict(label_field.vocab.stoi) == expected_stoi

    # Turn the stoi dictionary into an itos list
    expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                          key=lambda tup: tup[1])]
    assert label_field.vocab.itos == expected_itos
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length - 2]
    return tokens

MYTEXT = data.Field(batch_first=True,
                    use_vocab=False,
                    tokenize=tokenize_and_cut,
                    preprocessing=tokenizer.convert_tokens_to_ids,
                    init_token=init_token_idx,
                    eos_token=eos_token_idx,
                    pad_token=pad_token_idx,
                    unk_token=unk_token_idx)

MYSENTIMENT = data.LabelField(dtype=torch.float)

# INFERENCE
def predict_sentiment(model, tokenizer, sentence):
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length - 2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()
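
# Illustrative usage of predict_sentiment above (assumption: `tokenizer` is the
# pretrained subword tokenizer whose special-token indices and max_input_length
# were used when building MYTEXT, and `model` is the trained classifier).
# The sigmoid output is close to 1.0 for positive and 0.0 for negative reviews.
print(predict_sentiment(model, tokenizer, "This film is terrible"))
print(predict_sentiment(model, tokenizer, "This film is great"))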
fpLabelP3Train = fopOutputML + 'train.label.p3.txt'
# fpValid = fopRoot + 'valid.csv'
# fpTextValid = fopRoot + 'testP.text.txt'
fpLabelP1Valid = fopOutputML + 'testP.label.p1.txt'
fpLabelP2Valid = fopOutputML + 'testP.label.p2.txt'
fpLabelP3Valid = fopOutputML + 'testP.label.p3.txt'
# fpTest = fopRoot + 'test.csv'
# fpTextTest = fopRoot + 'testW.text.txt'
fpLabelP1Test = fopOutputML + 'testW.label.p1.txt'
fpLabelP2Test = fopOutputML + 'testW.label.p2.txt'
fpLabelP3Test = fopOutputML + 'testW.label.p3.txt'

sys.stdout = open(fpResultDetails, 'w')

TEXT = data.Field(tokenize='spacy', batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.long, batch_first=True, use_vocab=True)
fields = [('label', LABEL), ('text', TEXT)]

# Loading custom dataset p1
train_data = data.TabularDataset(path=fpLabelP1Train, format='csv',
                                 fields=fields, skip_header=True)
valid_data = data.TabularDataset(path=fpLabelP1Valid, format='csv',
                                 fields=fields, skip_header=True)
test_data = data.TabularDataset(path=fpLabelP1Test, format='csv',
                                fields=fields, skip_header=True)
acc_p1 = trainAndEval(train_data, valid_data, test_data)
from torchtext.legacy import data
from torchtext.legacy import datasets
import random
import numpy as np
import torch

SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Define the data fields
TEXT = data.Field(tokenize='spacy',
                  tokenizer_language='en_core_web_sm',
                  batch_first=True)
LABEL = data.LabelField(dtype=torch.float)

# Load the IMDB train/test data in the format defined above
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Split off a validation set
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data,
                 max_size=MAX_VOCAB_SIZE,
                 vectors="glove.6B.50d",
                 unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)
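
# A hedged sketch of the usual follow-up step (not part of the original
# snippet): copy the pre-trained GloVe vectors built above into an embedding
# layer and zero out the <unk> and <pad> rows. EMBEDDING_DIM matches the 50-d
# vectors requested above; the layer name `embedding` is illustrative only.
import torch.nn as nn

EMBEDDING_DIM = 50
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

embedding = nn.Embedding(len(TEXT.vocab), EMBEDDING_DIM, padding_idx=PAD_IDX)
embedding.weight.data.copy_(TEXT.vocab.vectors)
embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)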
def test_stratified_dataset_split(self):
    num_examples, num_labels = 30, 3
    self.write_test_splitting_dataset(num_examples=num_examples,
                                      num_labels=num_labels)
    text_field = data.Field()
    label_field = data.LabelField()
    fields = [('text', text_field), ('label', label_field)]

    dataset = data.TabularDataset(
        path=self.test_dataset_splitting_path, format="csv", fields=fields)

    # Default split ratio
    expected_train_size = 21
    expected_test_size = 9

    train, test = dataset.split(stratified=True)
    assert len(train) == expected_train_size
    assert len(test) == expected_test_size

    # Test array arguments with same ratio
    split_ratio = [0.7, 0.3]
    train, test = dataset.split(split_ratio=split_ratio, stratified=True)
    assert len(train) == expected_train_size
    assert len(test) == expected_test_size

    # Test strata_field argument
    train, test = dataset.split(split_ratio=split_ratio, stratified=True,
                                strata_field='label')
    assert len(train) == expected_train_size
    assert len(test) == expected_test_size

    # Test invalid field name
    strata_field = 'dummy'
    with pytest.raises(ValueError):
        dataset.split(split_ratio=split_ratio, stratified=True,
                      strata_field=strata_field)

    # Test uneven stratify sizes
    num_examples, num_labels = 28, 3
    self.write_test_splitting_dataset(num_examples=num_examples,
                                      num_labels=num_labels)
    # 10 examples for class 1 and 9 examples for classes 2 and 3
    dataset = data.TabularDataset(
        path=self.test_dataset_splitting_path, format="csv", fields=fields)

    expected_train_size = 7 + 6 + 6
    expected_test_size = 3 + 3 + 3

    train, test = dataset.split(split_ratio=split_ratio, stratified=True)
    assert len(train) == expected_train_size
    assert len(test) == expected_test_size

    split_ratio = [0.7, 0.3]
    train, test = dataset.split(split_ratio=split_ratio, stratified=True)
    assert len(train) == expected_train_size
    assert len(test) == expected_test_size

    # Add validation set
    split_ratio = [0.6, 0.3, 0.1]
    expected_train_size = 6 + 5 + 5
    expected_valid_size = 1 + 1 + 1
    expected_test_size = 3 + 3 + 3

    train, valid, test = dataset.split(split_ratio=split_ratio,
                                       stratified=True)
    assert len(train) == expected_train_size
    assert len(valid) == expected_valid_size
    assert len(test) == expected_test_size
def test_dataset_split_arguments(self):
    num_examples, num_labels = 30, 3
    self.write_test_splitting_dataset(num_examples=num_examples,
                                      num_labels=num_labels)
    text_field = data.Field()
    label_field = data.LabelField()
    fields = [('text', text_field), ('label', label_field)]

    dataset = data.TabularDataset(
        path=self.test_dataset_splitting_path, format="csv", fields=fields)

    # Test default split ratio (0.7)
    expected_train_size = 21
    expected_test_size = 9

    train, test = dataset.split()
    assert len(train) == expected_train_size
    assert len(test) == expected_test_size

    # Test array arguments with same ratio
    split_ratio = [0.7, 0.3]
    train, test = dataset.split(split_ratio=split_ratio)
    assert len(train) == expected_train_size
    assert len(test) == expected_test_size

    # Add validation set
    split_ratio = [0.6, 0.3, 0.1]
    expected_train_size = 18
    expected_valid_size = 3
    expected_test_size = 9

    train, valid, test = dataset.split(split_ratio=split_ratio)
    assert len(train) == expected_train_size
    assert len(valid) == expected_valid_size
    assert len(test) == expected_test_size

    # Test ratio normalization
    split_ratio = [6, 3, 1]
    train, valid, test = dataset.split(split_ratio=split_ratio)
    assert len(train) == expected_train_size
    assert len(valid) == expected_valid_size
    assert len(test) == expected_test_size

    # Test that only two splits are returned when the valid split is too small
    split_ratio = [0.66, 0.33, 0.01]
    expected_length = 2
    splits = dataset.split(split_ratio=split_ratio)
    assert len(splits) == expected_length

    # Test invalid arguments
    split_ratio = 1.1
    with pytest.raises(AssertionError):
        dataset.split(split_ratio=split_ratio)

    split_ratio = -1.
    with pytest.raises(AssertionError):
        dataset.split(split_ratio=split_ratio)

    split_ratio = [0.7]
    with pytest.raises(AssertionError):
        dataset.split(split_ratio=split_ratio)

    split_ratio = [1, 2, 3, 4]
    with pytest.raises(AssertionError):
        dataset.split(split_ratio=split_ratio)

    split_ratio = "string"
    with pytest.raises(ValueError):
        dataset.split(split_ratio=split_ratio)
def main(config, model_filename):
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)

    if not os.path.exists(config.cache_dir):
        os.makedirs(config.cache_dir)

    model_file = os.path.join(config.output_dir, model_filename)

    # Prepare the device
    gpu_ids = [int(device_id) for device_id in config.gpu_ids.split()]
    device, n_gpu = get_device(gpu_ids[0])
    if n_gpu > 1:
        n_gpu = len(gpu_ids)

    # Set random seeds
    random.seed(config.seed)
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)
        torch.backends.cudnn.deterministic = True

    # Prepare the data
    id_field = data.RawField()
    id_field.is_target = False
    text_field = data.Field(tokenize='spacy', lower=True, include_lengths=True)
    label_field = data.LabelField(dtype=torch.long)

    train_iterator, dev_iterator, test_iterator = load_data(
        config.data_path, id_field, text_field, label_field,
        config.train_batch_size, config.dev_batch_size,
        config.test_batch_size, device, config.glove_word_file,
        config.cache_dir)

    # Word vectors
    word_emb = text_field.vocab.vectors

    if config.model_name == "GAReader":
        from Baselines.GAReader.GAReader import GAReader
        model = GAReader(
            config.glove_word_dim, config.output_dim, config.hidden_size,
            config.rnn_num_layers, config.ga_layers, config.bidirectional,
            config.dropout, word_emb)

    print(model)

    # optimizer = optim.Adam(model.parameters(), lr=config.lr)
    optimizer = optim.SGD(model.parameters(), lr=config.lr)
    criterion = nn.CrossEntropyLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    if config.do_train:
        train(config.epoch_num, model, train_iterator, dev_iterator,
              optimizer, criterion, ['0', '1', '2', '3', '4'], model_file,
              config.log_dir, config.print_step, config.clip)

    model.load_state_dict(torch.load(model_file))

    test_loss, test_acc, test_report = evaluate(
        model, test_iterator, criterion, ['0', '1', '2', '3', '4'])
    print("-------------- Test -------------")
    print("\t Loss: {} | Acc: {} | Macro avg F1: {} | Weighted avg F1: {}".format(
        test_loss, test_acc, test_report['macro avg']['f1-score'],
        test_report['weighted avg']['f1-score']))
# pip install spacy
# python -m spacy download en_core_web_sm
from torchtext.legacy import data
from tqdm import tqdm
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, confusion_matrix

device = "cpu"

# dataset
LABEL = data.LabelField()
POST = data.Field(tokenize="spacy", lower=True,
                  tokenizer_language="en_core_web_sm")
fields = [("body", POST), ("label", LABEL)]
dataset = data.TabularDataset(path="pytorch_data.csv", format="CSV",
                              fields=fields)
train, test = dataset.split(split_ratio=[0.8, 0.2])

# vocabulary
POST.build_vocab(train, max_size=10000)  # , vectors = 'glove.6B.200d')
LABEL.build_vocab(train)  # fixes `"LabelField" has no attribute "vocab"`

# data loaders
train_iterator, test_iterator = data.BucketIterator.splits(
    (train, test), batch_size=32, device=device,