def load_train_data(data_path, batch_size, max_src_len, max_trg_len, use_cuda=False):
    # Note: use_vocab=False since the inputs are already preprocessed (numericalized).
    src_field = Field(sequential=True, use_vocab=False, include_lengths=True,
                      batch_first=True, pad_token=PAD, unk_token=UNK,
                      init_token=None, eos_token=None)
    trg_field = Field(sequential=True, use_vocab=False, include_lengths=True,
                      batch_first=True, pad_token=PAD, unk_token=UNK,
                      init_token=BOS, eos_token=EOS)
    fields = (src_field, trg_field)
    device = None if use_cuda else -1  # legacy torchtext: None = current GPU, -1 = CPU

    def filter_pred(example):
        return len(example.src) <= max_src_len and len(example.trg) <= max_trg_len

    dataset = torch.load(data_path)
    train_src, train_tgt = dataset['train_src'], dataset['train_tgt']
    dev_src, dev_tgt = dataset['dev_src'], dataset['dev_tgt']

    train_data = ParallelDataset(train_src, train_tgt, fields=fields, filter_pred=filter_pred)
    train_iter = Iterator(dataset=train_data, batch_size=batch_size,
                          train=True,  # Variable(volatile=False)
                          sort_key=lambda x: data.interleave_keys(len(x.src), len(x.trg)),
                          repeat=False, shuffle=True, device=device)

    dev_data = ParallelDataset(dev_src, dev_tgt, fields=fields)
    dev_iter = Iterator(dataset=dev_data, batch_size=batch_size,
                        train=False,  # Variable(volatile=True)
                        repeat=False, device=device, shuffle=False, sort=False)

    return src_field, trg_field, train_iter, dev_iter
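# ParallelDataset is project-specific and not shown in this file. A minimal
# sketch of one possible implementation, assuming train_src/train_tgt are lists
# of already-numericalized token-id sequences; the real class may differ:
from torchtext import data

class ParallelDataset(data.Dataset):
    def __init__(self, src_seqs, trg_seqs, fields, **kwargs):
        src_field, trg_field = fields
        example_fields = [('src', src_field), ('trg', trg_field)]
        examples = [data.Example.fromlist([src, trg], example_fields)
                    for src, trg in zip(src_seqs, trg_seqs)]
        # kwargs forwards filter_pred to data.Dataset, which drops long examples.
        super().__init__(examples, example_fields, **kwargs)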
def load_dataset(data_path, train_batch_size=4096, dev_batch_size=1, max_len=100):
    """
    Assumes the data has already been pre-processed with the Moses tokenizer.
    Returns iterators for the training/dev datasets.

    Arguments:
        data_path: path of the dataset
        train_batch_size: batch size of the training data (defined in terms of
            number of tokens or sentences, depending on the model_type)
        dev_batch_size: batch size of the dev data (usually one)
        max_len: max length of sequences in a batch
    """
    SRC = Field(tokenize=lambda s: s.split(), init_token="<s>", eos_token="</s>",
                batch_first=True, include_lengths=True)
    TRG = Field(tokenize=lambda s: s.split(), init_token="<s>", eos_token="</s>",
                batch_first=True, include_lengths=True)

    # Create a TranslationDataset for both the train and dev sets.
    train_data = datasets.TranslationDataset(
        exts=("train.de", "train.en"), fields=(SRC, TRG), path=data_path,
        filter_pred=lambda x: len(vars(x)['src']) <= max_len and len(vars(x)['trg']) <= max_len)
    dev_data = datasets.TranslationDataset(
        exts=("dev.de", "dev.en"), fields=(SRC, TRG), path=data_path)

    # Load in the test set.
    test_examples = []
    with open(data_path + "test.de", "r") as f:
        for test_example in f.readlines():
            example = data.Example()
            setattr(example, "src", test_example.split())
            test_examples.append(example)
    test_data = data.Dataset(test_examples, fields=[("src", SRC)])

    # Build the vocab using the training data (shared across source and target).
    SRC.build_vocab(train_data.src, train_data.trg)
    TRG.build_vocab(train_data.src, train_data.trg)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Use a custom DataIterator to minimize padding within a batch and to pack
    # each batch as fully as possible, maximizing GPU utilization.
    train_iterator = DataIterator(train_data, batch_size=train_batch_size, device=device,
                                  repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                                  batch_size_fn=batch_size_fn, train=True,
                                  sort_within_batch=True, shuffle=True)

    # Use a regular Iterator for the dev set so translations can be compared to a
    # gold-standard file; a DataIterator would return them in shuffled/random order.
    dev_iterator = Iterator(dev_data, batch_size=dev_batch_size, train=False,
                            sort=False, repeat=False, device=device)

    # Create an Iterator for the test data.
    test_iterator = Iterator(test_data, batch_size=1, train=False, sort=False,
                             repeat=False, device=device)
    print(len(test_iterator))

    return train_iterator, dev_iterator, test_iterator, SRC, TRG
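# batch_size_fn and DataIterator above are custom helpers that are not defined
# in this file. A minimal sketch of a token-count-based batch_size_fn (the
# pattern popularized by the Annotated Transformer); the actual implementation
# used here may differ:
def batch_size_fn(new_example, count, sofar):
    """Return the number of tokens the batch would contain if new_example were added."""
    global max_src_in_batch, max_trg_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_trg_in_batch = 0
    max_src_in_batch = max(max_src_in_batch, len(new_example.src) + 2)  # +2 for <s> and </s>
    max_trg_in_batch = max(max_trg_in_batch, len(new_example.trg) + 2)
    return max(count * max_src_in_batch, count * max_trg_in_batch)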
def tokenize(self):
    ENGLISH = Field(sequential=True, use_vocab=True, tokenize=str.split, lower=True,
                    init_token="<sos>", eos_token="<eos>")
    FRENCH = Field(sequential=True, use_vocab=True, tokenize=str.split, lower=True,
                   init_token="<sos>", eos_token="<eos>")

    # In order for this to work, change "csv.field_size_limit(sys.maxsize)" in
    # torchtext/utils.py to "csv.field_size_limit(maxInt)".
    train, test = TabularDataset.splits(path='./data/', train='train.csv', test='test.csv',
                                        format='csv', fields=[('en', ENGLISH), ('fr', FRENCH)])

    ENGLISH.build_vocab(train, test)
    FRENCH.build_vocab(train, test)

    self.en_vocab = ENGLISH
    self.fr_vocab = FRENCH
    self.en_vocabsize = len(ENGLISH.vocab)
    self.fr_vocabsize = len(FRENCH.vocab)

    batch_size = 2 if self.config.debug else self.config.batchsize
    train_loader, test_loader = Iterator.splits((train, test),
                                                batch_size=batch_size,
                                                device="cuda",
                                                shuffle=False,
                                                sort_key=lambda x: len(x.en),
                                                sort_within_batch=False)
    return train_loader, test_loader
def dataset2iter(workpath=WORK_PATH, train_path=FILE_TRAIN,
                 validation_path=FILE_VALID, test_path=FILE_TEST):
    fields = [('sentence', SENTENCE), ('wxx', LABEL), ('char', CHAR)]

    # When a data row starts with a double quote ("), csv.reader treats the row as
    # quoted and may swallow delimiters, so pass the extra parameters below.
    # torchtext docs: https://pytorch.org/text/stable/data.html#torchtext.data.TabularDataset.__init__
    # csv module docs: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
    csv_reader_params = {
        'doublequote': False,
        'quoting': csv.QUOTE_NONE,
    }
    data_train = TabularDataset(workpath + train_path, format="tsv", fields=fields,
                                skip_header=True, csv_reader_params=csv_reader_params)
    data_valid = TabularDataset(workpath + validation_path, format="tsv", fields=fields,
                                skip_header=True, csv_reader_params=csv_reader_params)
    data_test = TabularDataset(workpath + test_path, format="tsv", fields=fields,
                               skip_header=True, csv_reader_params=csv_reader_params)

    pretrained_vectors = Vectors(name=GLOVE_PATH + TRAINED_VECTORS + '.txt', cache=GLOVE_PATH)
    SENTENCE.build_vocab(data_train, vectors=pretrained_vectors,
                         unk_init=lambda x: torch.nn.init.uniform_(x, a=-0.25, b=0.25))
    LABEL.build_vocab(data_train)
    CHAR.build_vocab(data_train)

    # debug
    # return data_train, data_valid, data_test

    iter_train = Iterator(data_train, batch_size=BATCH_SIZE, train=True,
                          sort_key=lambda x: len(x.sentence), shuffle=True, device=DEVICE)
    iter_valid = Iterator(data_valid, batch_size=BATCH_SIZE, train=False,
                          sort=False, shuffle=True, device=DEVICE)
    iter_test = Iterator(data_test, batch_size=BATCH_SIZE, train=False,
                         sort=False, shuffle=True, device=DEVICE)
    return iter_train, iter_valid, iter_test
def data_split(text_field, label_field, dataset, mode=False):
    if mode == 'init':
        for index, c in enumerate(dataset):
            partial = NLPDataLoader(c, text_field=text_field, label_field=label_field, test=False)
            if index == 0:
                text_field.build_vocab(partial)
                label_field.build_vocab(list(range(13)))
            else:
                # Merge the token counts of this partition into the existing vocab.
                text_counter = text_field.vocab.freqs
                for example in partial.examples:
                    text_counter.update(example.text)
                text_field.vocab = text_field.vocab_cls(text_counter, specials=['<unk>', '<pad>'])
        return
    elif mode is False:
        dataset = NLPDataLoader(dataset, text_field=text_field, label_field=label_field, test=False)
        return Iterator.splits((dataset,), batch_size=20)
    elif mode is True:
        dataset = NLPDataLoader(dataset, text_field=text_field, label_field=label_field, test=True)
        return Iterator.splits((dataset,), batch_size=20, shuffle=False)
def fit(self, filepath, train_dev_ratio=0.8, batch_size=64, nepoch=10):
    """
    Feed training data to train the model.

    Args:
        filepath: a string, the path of the dataset
        train_dev_ratio: a float, the ratio used to split the train and dev datasets
        batch_size: an integer, the batch size used during training
        nepoch: an integer, the number of training epochs

    TODO:
        1) support early stopping
        2) support customized delimiter
        3) support callback function
        4) support fit_generator
    """
    train, dev = self._process_data(filepath, train_dev_ratio)
    self.text_field.build_vocab(train, vectors="glove.6B.50d")
    self.label_field.build_vocab(train)
    self._build_network()

    train_iter = Iterator(train, batch_size=batch_size, shuffle=True)
    dev_iter = Iterator(dev, batch_size=batch_size, shuffle=True)

    optimizer = torch.optim.Adam(self.network.parameters(), lr=self.lr)
    loss_fn = nn.BCELoss()
    self.network.train()

    best_acc = 0
    for epoch in range(1, nepoch + 1):
        for i, batch in enumerate(train_iter):
            feature, target = batch.text, batch.label
            target = target.type(torch.FloatTensor)
            feature.data.t_(), target.data.sub_(1)
            optimizer.zero_grad()
            y_pred = self.network(feature).reshape(-1)
            loss = loss_fn(y_pred, target)
            loss.backward()
            optimizer.step()

            label_pred = (np.array(y_pred.data) > 0.5).astype(int)
            label_true = np.array(target)
            train_acc = accuracy_score(label_true, label_pred)
            output_str = '\rEpoch:{} batch:{} loss:{:.6f} acc:{:.2f}'
            sys.stdout.write(output_str.format(epoch, i, loss.item(), train_acc))

        dev_acc, dev_p, dev_r, dev_f1 = self.evaluate(dev_iter)
        if dev_acc > best_acc:
            best_acc = dev_acc
            print('Saving best model:')
            output_str = '\nBest - acc:{:.2f} p:{:.2f} r:{:.2f} f1:{:.2f} \n \n'
            print(output_str.format(dev_acc, dev_p, dev_r, dev_f1))
            self._save_weights(self.network)
    return
def __init__(self, module_name, train_bs, eval_bs, device, log):
    self.module_name = module_name

    # split_chars = lambda x: list("".join(x.split()))
    split_chars = lambda x: list(x)  # keeps whitespaces

    source = Field(tokenize=split_chars, init_token='<sos>', eos_token='<eos>', batch_first=True)
    target = Field(tokenize=split_chars, init_token='<sos>', eos_token='<eos>', batch_first=True)

    log("Loading FULL datasets ...")
    folder = os.path.join(DATASET_TARGET_DIR, module_name)
    train_dataset, eval_dataset, _ = TranslationDataset.splits(
        path=folder, root=folder,
        exts=(INPUTS_FILE_ENDING, TARGETS_FILE_ENDING),
        fields=(source, target),
        train=TRAIN_FILE_NAME, validation=EVAL_FILE_NAME, test=EVAL_FILE_NAME)

    log("Building vocab ...")
    source.build_vocab(train_dataset)
    target.vocab = source.vocab

    log("Creating iterators ...")
    train_iterator = Iterator(dataset=train_dataset, batch_size=train_bs,
                              train=True, repeat=True, shuffle=True, device=device)
    eval_iterator = Iterator(dataset=eval_dataset, batch_size=eval_bs,
                             train=False, repeat=False, shuffle=False, device=device)

    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.train_iterator = train_iterator
    self.eval_iterator = eval_iterator
    self.source = source
    self.target = target
def create_data_iterator(batch_size, tabular_train, tabular_valid, d):
    # Create the Iterators for the datasets (an Iterator works like a dataloader).
    train_iter = Iterator(tabular_train, batch_size=batch_size, device=d,
                          sort_within_batch=False, repeat=False)
    valid_iter = Iterator(tabular_valid, batch_size=batch_size, device=d,
                          sort_within_batch=False, repeat=False)
    return train_iter, valid_iter
def get_dataloader(self, proc_id=0, n_gpus=1, device=torch.device('cpu'), batch_size=64):
    def _distribute_dataset(dataset):
        n = len(dataset)
        part = dataset[n * proc_id // n_gpus:n * (proc_id + 1) // n_gpus]
        return torchtext.data.Dataset(part, dataset.fields)

    train_ds = _distribute_dataset(self.train_ds)
    self.verbose = self.verbose and (proc_id == 0)

    train_iter, valid_iter = BucketIterator.splits(
        (train_ds, self.valid_ds),
        batch_sizes=(batch_size, batch_size),
        sort_within_batch=True,
        sort_key=lambda x: len(x.input),
        device=device,
        repeat=False,
    )
    test_iter = Iterator(
        self.test_ds,
        batch_size=1,
        sort=False,
        sort_within_batch=False,
        device=device,
        repeat=False,
    )
    train_dl = BatchWrapper(train_iter)
    valid_dl = BatchWrapper(valid_iter)
    test_dl = BatchWrapper(test_iter)
    return train_dl, valid_dl, test_dl
def build_prediction_iterator(self, df):
    dataset = base.CommentsDataset(df, self.fields)
    pred_id = list(df['id'].values)
    pred_iter = Iterator(dataset, batch_size=self.params['batch_size'],
                         repeat=False, shuffle=False, sort=False)
    return pred_id, pred_iter
def eval(self, tasknum, total: int, trainloss: float = 0, epoch: int = 0):
    data_iter = Iterator(self.task[tasknum].te_dataset, batch_size=self.batch_size,
                         repeat=False, sort_key=lambda x: len(x.syllable_contents),
                         train=False, device=self.device)
    tq_iter = tqdm(enumerate(data_iter), total=math.ceil(total / self.batch_size),
                   unit_scale=self.batch_size, bar_format='{r_bar}')
    pred_lst = list()
    truth_lst = list()
    acc_lst = list()
    label_lst = list()

    self.model.eval()
    for i, batch in tq_iter:
        preds = self.model(batch.syllable_contents, tasknum)
        label = torch.tensor(batch.label, dtype=torch.long, device=self.device)
        if self.isbinary:
            accs = torch.eq(preds > 0.5, batch.label > 0.5).to(torch.float)
        else:
            accs = torch.eq(torch.argmax(preds, dim=1), label).to(torch.long)
        label_lst += label.tolist()
        acc_lst += accs.tolist()
        pred_lst += preds.tolist()

    prec, recall, f1score, f1s, rocauc = self.getscore(pred_lst, label_lst)
    accuracy = sum(acc_lst) / total
    self.log_to_c3dl(json.dumps(
        {'type': 'test', 'epoch': epoch, 'accuracy': accuracy, 'precision': prec,
         'recall': recall, 'f1score': f1score, 'ROC-AUC': rocauc}))
    wandb.log({'Epoch': epoch, 'Accuracy': accuracy, 'Precision': prec, 'Recall': recall,
               'F1Score': f1score, 'Trainloss': trainloss, 'ROC-AUC': rocauc})
    if len(f1s) != 0:
        for i in range(len(f1s)):
            wandb.log({f'Class {i} F1Score': f1s[i]})
    return acc_lst, total, prec, recall, f1score, f1s, rocauc
def predict_and_save(dataset=None, model=None, dataset_path='dev.conll',
                     out_path='predict.conll', **kwargs):
    """Combine the original CONLL-X file with predictions.

    This is required since the iterator might have changed certain fields
    (e.g. lowercasing). We read dataset_path separately and replace the
    fields we predicted.
    """
    device = torch.device(type='cuda') if use_cuda else torch.device(type='cpu')
    data_iter = Iterator(dataset, 1, train=False, sort=False, shuffle=False, device=device)
    start_time = time.time()
    i2pos = dataset.fields['pos'].vocab.itos

    with open(out_path, mode='w', encoding='utf-8') as f:
        with open(dataset_path, mode='r', encoding='utf-8') as data_f:
            with torch.no_grad():
                if "ud" in dataset_path:
                    original_iter = read_conllu(data_f)
                    ud = True
                    i2upos = dataset.fields['upos'].vocab.itos
                else:
                    original_iter = read_conllx(data_f)
                    ud = False

                for pred in predict(data_iter=data_iter, model=model):
                    tokens = next(original_iter)
                    pred_tags = [-1] * len(tokens)
                    pred_utags = [-1] * len(tokens)
                    write_tag, write_utag = False, False

                    if len(pred["pos"]) > 0:
                        pred_tags = pred["pos"].data.view(-1).tolist()
                        write_tag = True
                    if len(pred["upos"]) > 0:
                        pred_utags = pred["upos"].data.view(-1).tolist()
                        write_utag = True

                    for tok, pred_tag, pred_utag in zip(tokens, pred_tags, pred_utags):
                        if write_tag:
                            if ud:
                                tok.xpos = i2pos[pred_tag]
                            else:
                                tok.pos = i2pos[pred_tag]
                        if write_utag:
                            tok.upos = i2upos[pred_utag]
                        f.write(str(tok) + '\n')
                    f.write('\n')
def get_iterator(dataset, device, batch_size, shuffle=True, repeat=False):
    train, val, test = dataset
    train_iter, val_iter = BucketIterator.splits((train, val),
                                                 batch_size=batch_size,
                                                 device=device,
                                                 sort_key=lambda x: len(x.comment_text),
                                                 sort_within_batch=False,
                                                 shuffle=shuffle,
                                                 repeat=repeat)
    test_iter = Iterator(test, batch_size=batch_size, device=device,
                         sort_within_batch=False, repeat=repeat, sort=False)

    label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    train_dl = BatchWrapper(train_iter, "comment_text", label_cols)
    valid_dl = BatchWrapper(val_iter, "comment_text", label_cols)
    test_dl = BatchWrapper(test_iter, "comment_text", None)

    return train_dl, valid_dl, test_dl
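# BatchWrapper is not defined in this file. A minimal sketch that is consistent
# with the call signature used above (text field name plus a list of label
# columns), as commonly seen in torchtext tutorials; the original implementation
# may differ:
import torch

class BatchWrapper:
    def __init__(self, iterator, x_var, y_vars):
        self.iterator, self.x_var, self.y_vars = iterator, x_var, y_vars

    def __iter__(self):
        for batch in self.iterator:
            x = getattr(batch, self.x_var)  # the text tensor
            if self.y_vars is not None:
                # Stack the per-column label tensors into one (batch, n_labels) target.
                y = torch.cat([getattr(batch, col).unsqueeze(1).float()
                               for col in self.y_vars], dim=1)
            else:
                y = None
            yield x, y

    def __len__(self):
        return len(self.iterator)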
def test_text_cnn(config):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(config['gpu'])
    base_path = config['base_path']
    save_path = os.path.join(base_path, 'text_cnn.pkl')
    vocab_path = os.path.join(base_path, 'vocab.pkl')

    TEXT = data.Field(sequential=True, lower=True, batch_first=True)
    LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)
    fields = [('sentence', TEXT), ('label', LABEL)]

    test_data = TabularDataset(path=os.path.join(base_path, 'test.tsv'),
                               format='tsv', skip_header=True, fields=fields)
    with open(vocab_path, 'rb') as handle:
        vocab = pickle.load(handle)
    TEXT.vocab = vocab

    device = torch.device('cuda:0')
    test_iter = Iterator(test_data, batch_size=config['batch_size'], shuffle=False, device=device)
    model = torch.load(save_path)
    test_accuracy = eval_text_cnn(model, test_iter)
    print('test_accuracy: %.4f' % test_accuracy)
def evaluate(self, dev_data):
    """
    Evaluate on the dev dataset.

    Args:
        dev_data: torchtext.data.Iterator or torchtext.data.Dataset

    Returns:
        a tuple of (accuracy, precision, recall, f1_score)
    """
    if isinstance(dev_data, Iterator):
        dev_iter = dev_data
    else:
        dev_iter = Iterator(dev_data, batch_size=32)

    self.network.eval()
    label_pred, label_true = [], []
    for batch in dev_iter:
        feature, target = batch.text, batch.label
        target = target.type(torch.FloatTensor)
        # Since the label vocab is {unk: 0, 0: 1, 1: 2}, subtract 1 to recover {0, 1}.
        feature.data.t_(), target.data.sub_(1)
        y_pred = self.network(feature)
        y_pred = y_pred.reshape(-1)
        label_pred += list((np.array(y_pred.data) > 0.5).astype(int))
        label_true += list(np.array(target))

    acc = accuracy_score(label_true, label_pred)
    p = precision_score(label_true, label_pred)
    r = recall_score(label_true, label_pred)
    f1 = f1_score(label_true, label_pred)
    output_str = '\nEval - acc:{:.2f} p:{:.2f} r:{:.2f} f1:{:.2f} \n'
    print(output_str.format(acc, p, r, f1))
    return acc, p, r, f1
def __init__(self, batch_size):
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.pad_id = self.tokenizer._convert_token_to_id("[PAD]")
    self.batch_size = batch_size

    # Objects in which the data will be stored.
    self.text = Field(sequential=True, lower=True, tokenize=self.tokenizer.tokenize,
                      batch_first=True, pad_token='[PAD]', unk_token='[UNK]')
    self.labels = Field(sequential=False, is_target=True)

    self.train, self.dev, self.test = MultiNLI.splits(self.text, self.labels)

    # Builds vocabulary for the data.
    self.text.build_vocab(self.train, self.dev, self.test)
    self.labels.build_vocab(self.train)

    self.train_size = len(self.train)
    self.val_size = len(self.dev)
    self.test_size = len(self.test)
    self.name = 'mnli'

    # Standard torchtext iterators, these do not return input suitable for BERT.
    self.train_iter, self.dev_iter, self.test_iter = Iterator.splits(
        (self.train, self.dev, self.test),
        batch_size=self.batch_size,
        device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'))
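# Note: the torchtext vocab built above assigns its own indices, which do not
# match BERT's wordpiece ids (hence the comment about input not being suitable
# for BERT). A hypothetical helper, not part of the original code, that remaps
# a torchtext batch to BERT input ids:
import torch

def to_bert_ids(batch_text, vocab, tokenizer):
    # batch_text: LongTensor of torchtext vocab indices, shape (batch, seq_len)
    sequences = [[vocab.itos[idx] for idx in seq] for seq in batch_text.tolist()]
    ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in sequences]
    return torch.tensor(ids)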
def __init__(self, root_dir='data', batch_size=64, use_vector=True):
    self.TEXT = Field(sequential=True, use_vocab=True, tokenize='spacy', lower=True, batch_first=True)
    self.LABEL = LabelField(tensor_type=torch.FloatTensor)
    vectors = Vectors(name='mr_vocab.txt', cache='./')

    dataset_path = os.path.join(root_dir, '{}.tsv')
    self.dataset = {}
    self.dataloader = {}
    for target in ['train', 'dev', 'test']:
        self.dataset[target] = TabularDataset(
            path=dataset_path.format(target),
            format='tsv',
            fields=[('text', self.TEXT), ('label', self.LABEL)])
        if use_vector:
            self.TEXT.build_vocab(self.dataset[target], max_size=25000, vectors=vectors)
        else:
            self.TEXT.build_vocab(self.dataset[target], max_size=25000)
        self.LABEL.build_vocab(self.dataset[target])
        self.dataloader[target] = Iterator(self.dataset[target],
                                           batch_size=batch_size,
                                           device=None,
                                           repeat=False,
                                           sort_key=lambda x: len(x.text),
                                           shuffle=True)
def classify_from_strings(self, strings: Union[List[str], str]) -> list:
    """
    Classify one or more examples with a trained classifier.

    :param strings: a single string or a list of strings representing the pieces of text
        that should be classified
    :return: list containing the model's predictions for the given pieces of text
    """
    assert self.has_trained
    if isinstance(strings, str):
        strings = [strings]
    if isinstance(strings, list):
        strings = [[string] for string in strings]

    fields = [('text', self._TEXT)]
    list_of_examples = [Example.fromlist(string, fields) for string in strings]
    dataset = torchtext.data.Dataset(list_of_examples, fields)

    data = Iterator(dataset, batch_size=1, device=torch.device("cpu"), sort=False,
                    sort_within_batch=False, repeat=False, shuffle=False)

    predictions = []
    for item in data:
        x = item.text
        self.model.to(self.device)
        self.model = self.model.eval()
        outputs = self.model([x[0].to(self.device), x[1].to(self.device)])
        predictions.extend(outputs.detach().cpu().argmax(1).tolist())
    results = [self._label_names[i] for i in predictions]
    return results
def __init__(self, batch_size):
    self.text = Field(lower=True,
                      tokenize=lambda x: [tok.text for tok in spacy_en.tokenizer(x)],
                      batch_first=True)
    self.label = Field(sequential=False, unk_token=None, is_target=True)

    self.train, self.dev, self.test = SNLI.splits(self.text, self.label)
    self.sizes = {
        'train': len(self.train),
        'val': len(self.dev),
        'test': len(self.test)
    }
    self.text.build_vocab(self.train, self.dev)
    self.label.build_vocab(self.train)

    vector_cache_loc = '.vector_cache/snli_vectors.pt'
    if os.path.isfile(vector_cache_loc):
        self.text.vocab.vectors = torch.load(vector_cache_loc)
    else:
        self.text.vocab.load_vectors('glove.840B.300d')
        torch.save(self.text.vocab.vectors, vector_cache_loc)

    # Batching
    self.train_iter, self.dev_iter, self.test_iter = Iterator.splits(
        (self.train, self.dev, self.test),
        batch_size=batch_size,
        device='cuda:0' if torch.cuda.is_available() else 'cpu')

    self.vocab_size = len(self.text.vocab)
    self.out_dim = len(self.label.vocab)
    self.labels = self.label.vocab.stoi
def test_language_model(config: dict) -> None:
    os.environ['CUDA_VISIBLE_DEVICES'] = str(config['gpu'])
    base_path = config['base_path']
    save_path = os.path.join(base_path, 'language_model.pkl')
    vocab_path = os.path.join(base_path, 'vocab.pkl')

    TEXT = data.Field(sequential=True, lower=True, batch_first=True,
                      init_token=SOS, eos_token=EOS)
    fields = [('sentence', TEXT)]
    test_data = TabularDataset(path=os.path.join(base_path, 'test.tsv'),
                               format='tsv', skip_header=True, fields=fields)
    with open(vocab_path, 'rb') as handle:
        vocab = pickle.load(handle)
    TEXT.vocab = vocab

    device = torch.device('cuda:0')
    test_iter = Iterator(test_data, batch_size=config['batch_size'], shuffle=False, device=device)
    model = torch.load(save_path)
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_INDEX)
    test_loss = eval_language_model(model, test_iter, criterion)
    # 2 ** loss assumes the loss is measured in bits; use math.exp(test_loss) if it is in nats.
    print('test_loss: %.4f\ttest_ppl: %.4f' % (test_loss, 2 ** test_loss))
def __init__(self, dataset, batch_size, do_train, seed=1):
    super(BatchIterator, self).__init__()
    self.batch_size = batch_size
    self.do_train = do_train
    random.seed(seed)

    # We need different iterators for train and eval.
    if self.do_train:
        iterator = BucketIterator(dataset=dataset,
                                  batch_size=self.batch_size,
                                  train=True,
                                  sort_key=lambda x: torchtext.data.interleave_keys(
                                      len(x.src), len(x.trg)))
    else:
        iterator = Iterator(dataset=dataset,
                            batch_size=self.batch_size,
                            sort=False,
                            sort_within_batch=False,
                            repeat=False)

    self.iterator = iterator
    self.num_batches = len(iterator)
    self.iter = iter(self.iterator)
def __init__(self, config):
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.batch_size = config['batch_size']
    self.pad_id = self.tokenizer._convert_token_to_id("[PAD]")

    # Objects in which the data will be stored.
    self.text = Field(sequential=True, lower=True, tokenize=self.tokenizer.tokenize,
                      batch_first=True, pad_token='[PAD]', unk_token='[UNK]')
    self.labels = Field(sequential=False, is_target=True)

    self.train, self.dev, self.test = MultiNLI.splits(self.text, self.labels)

    # Builds vocabulary for the data.
    self.text.build_vocab(self.train, self.dev, self.test)
    self.labels.build_vocab(self.train)

    # Standard torchtext iterators, these do not return input suitable for BERT.
    self.train_iter, self.dev_iter, self.test_iter = Iterator.splits(
        (self.train, self.dev, self.test),
        batch_size=config['batch_size'],
        device=config['device'])
def create_iter(self, batch_size):
    """
    Build the iterators.
    :param batch_size: size of each batch
    :return: iter
    """
    # Define the torchtext Fields.
    fields = [('english', self.english), ('chinese', self.chinese)]
    examples = []
    # Build English/Chinese examples.
    for en, ch in zip(self.english_list, self.chinese_list):
        item = [en, ch]
        examples.append(data.Example.fromlist(item, fields))

    # Split into training and test sets.
    train, test = Dataset(examples=examples, fields=fields).split(split_ratio=0.8)
    self.english.build_vocab(train)
    self.chinese.build_vocab(train)
    self.english_voca_size = len(self.english.vocab)
    self.chinese_voca_size = len(self.chinese.vocab)

    train_iter, test_iter = Iterator.splits((train, test),
                                            batch_sizes=(batch_size, len(test)),
                                            sort_key=lambda x: len(x.english),
                                            sort_within_batch=True,
                                            device=-1)
    return train_iter, test_iter
def load_data():
    TEXT = Field(sequential=True, use_vocab=True)
    LABELS = Field(sequential=True, use_vocab=False,
                   preprocessing=lambda x: list(map(int, x)), pad_token=-1)
    LENGTH = Field(sequential=False, use_vocab=False)

    train_set = TabularDataset(path=DATA_DIR + 'train.tok.tsv', format='TSV',
                               fields=[('text', TEXT), ('labels', LABELS), ('length', LENGTH)],
                               skip_header=True)
    val_set = TabularDataset(path=DATA_DIR + 'dev.tok.tsv', format='TSV',
                             fields=[('text', TEXT), ('labels', LABELS), ('length', LENGTH)],
                             skip_header=True)

    train_loader = Iterator(
        dataset=train_set,
        batch_size=BATCH_SIZE,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        shuffle=True,
    )
    val_loader = BucketIterator(
        dataset=val_set,
        batch_size=BATCH_SIZE,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        shuffle=False,
    )

    TEXT.build_vocab(train_set)
    # LABELS.build_vocab(train_set)
    return (TEXT, LABELS, LENGTH), (train_set, val_set), (train_loader, val_loader)
def train(dataset_dir, emb_file, epoch, batch_size):
    (train_data, test_data, text_field, label_field) = dataset.load_data(dataset_dir, emb_file)
    class_size = len(label_field.vocab)

    nbow = nbow_model.NBoW(class_size, text_field.vocab.vectors)
    nbow.train()

    optimizer = torch.optim.Adam(nbow.parameters())
    train_iter = Iterator(train_data, batch_size)
    for n in range(epoch):
        for batch in train_iter:
            optimizer.zero_grad()
            logit = nbow(batch.text.t())
            loss = F.cross_entropy(logit, batch.label)
            loss.backward()
            optimizer.step()

        nbow.eval()
        (accuracy, num_correct) = compute_accuracy(nbow, test_data)
        print('Epoch: {} Accuracy: {:.2f}% ({}/{})'.format(n + 1, accuracy * 100,
                                                           num_correct, len(test_data)))
        nbow.train()
def get_data_iter(train_csv, test_csv, fix_length):
    TEXT = data.Field(sequential=True, lower=True, fix_length=fix_length, batch_first=True)
    LABEL = data.Field(sequential=False, use_vocab=False)

    train_fields = [("label", LABEL), ("title", None), ("text", TEXT)]
    train = TabularDataset(path=train_csv, format="csv", fields=train_fields, skip_header=True)
    train_iter = BucketIterator(train, batch_size=batch_size, device=-1,
                                sort_key=lambda x: len(x.text),
                                sort_within_batch=False, repeat=False)

    test_fields = [("label", LABEL), ("title", None), ("text", TEXT)]
    test = TabularDataset(path=test_csv, format="csv", fields=test_fields, skip_header=True)
    test_iter = Iterator(test, batch_size=batch_size, device=-1, sort=False,
                         sort_within_batch=False, repeat=False)

    # vectors = Vectors(name=word2vec_dir)
    # TEXT.build_vocab(train, vectors=vectors)
    TEXT.build_vocab(train)
    vocab = TEXT.vocab
    return train_iter, test_iter, vocab
def test(args):
    train_data, dev_data, test_data, sentence_vocab, pred_arg_vocab, _ = read_data(
        args.path, args.train, args.test, args.dev)
    test_iter = Iterator(test_data, 1, sort_key=lambda x: len(x.sentence),
                         train=False, repeat=False)

    model = torch.load(args.save_model)
    model.eval()

    instances_seen = 0
    labels = []
    outputs = []
    with torch.no_grad():
        for v_iteration, instance in enumerate(test_iter):
            model_outputs = model(instance)
            output = torch.sigmoid(model_outputs)
            outputs.append(output.item())
            labels.append(instance.label[0].item())

    pred = lambda x: 1 if x >= 0.15 else 0
    predicted = [pred(x) for x in outputs]
    print(f1_score(labels, predicted))
def predict(self, model_name, filepath=Path('../data/output/sub.csv')):
    """Run the trained model on the submission file and return a DataFrame of predictions."""
    preds = []
    sub_dataset = TabularDataset(filepath, format="CSV", fields=self.fields, skip_header=True)
    sub_iter = Iterator(sub_dataset, batch_size=self.batch_size, device=self.device,
                        train=False, shuffle=False, sort=False)

    self.load_checkpoint(model_name)
    self.model.eval()
    with torch.no_grad():
        for (label, text), _ in sub_iter:
            label = label.type(torch.LongTensor)
            label = label.to(self.device)
            text = text.type(torch.LongTensor)
            text = text.to(self.device)
            output = self.model(text, label)
            _, output = output
            preds.extend(torch.argmax(output, 1).tolist())

    id_list = ["twitter_" + str(n) for n in range(1, len(sub_dataset) + 1)]
    label_list = ["SARCASM" if pred == 1 else "NOT_SARCASM" for pred in preds]
    df_sub = pd.DataFrame(list(zip(id_list, label_list)), columns=['id', 'label'])
    return df_sub
def predict_text_cnn(model_path, file_path, vocab_path, batch_size=64):
    TEXT = data.Field(sequential=True, lower=True, batch_first=True)
    fields = [('sentence', TEXT)]
    test_data = TabularDataset(path=file_path, format='tsv', skip_header=True, fields=fields)
    with open(vocab_path, 'rb') as handle:
        vocab = pickle.load(handle)
    TEXT.vocab = vocab

    device = torch.device('cuda:0')
    test_iter = Iterator(test_data, batch_size=batch_size, shuffle=False, device=device)
    model = torch.load(model_path)

    sentiments = []
    model.eval()
    with torch.no_grad():
        for batch in test_iter:
            sentence = batch.sentence
            logit = model(sentence)
            prob = torch.softmax(logit, dim=-1)[:, 1].tolist()
            sentiments.extend(prob)
    return sentiments
def get_iterator(dataset, batch_size, train=True):
    return Iterator(dataset, batch_size=batch_size, device=device,
                    train=train, shuffle=train, sort=False)
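# Hypothetical usage of get_iterator above (dataset and field names are
# illustrative only; `device` is assumed to be defined at module level):
# train_iter = get_iterator(train_dataset, batch_size=32, train=True)
# valid_iter = get_iterator(valid_dataset, batch_size=32, train=False)
# for batch in train_iter:
#     text, label = batch.text, batch.label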