def _csv_iterator(data_path, ngrams, yield_cls=False, label=-1):
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f, delimiter="\t")
        for row in reader:
            tokens = ' '.join([row[5]])
            # print(row[5])
            tokens = tokenizer(tokens)
            if yield_cls:
                yield row[7], ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
def build_legacy_torchtext_vocab_pipeline(vocab_file):
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator

    def token_iterator(vocab_file):
        f = open(vocab_file, 'r')
        for line in f:
            for token in line:
                yield token

    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    pipeline = sequential_transforms(tokenizer, vocab_func(vocab))
    return pipeline, None, None
def BookCorpus(vocab, tokenizer=get_tokenizer("basic_english"),
               data_select=('train', 'test', 'valid'), removed_tokens=[],
               min_sentence_len=None):
    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(('train', 'test', 'valid'))):
        raise TypeError('data_select is not supported!')

    extracted_files = glob.glob('/datasets01/bookcorpus/021819/*/*.txt')
    random.seed(1000)
    random.shuffle(extracted_files)

    num_files = len(extracted_files)
    _path = {
        'train': extracted_files[:(num_files // 20 * 17)],
        'test': extracted_files[(num_files // 20 * 17):(num_files // 20 * 18)],
        'valid': extracted_files[(num_files // 20 * 18):]
    }

    data = {}
    for item in _path.keys():
        data[item] = []
        logging.info('Creating {} data'.format(item))
        tokens = []
        for txt_file in _path[item]:
            with open(txt_file, 'r', encoding="utf8", errors='ignore') as f:
                for line in f.readlines():
                    _tokens = tokenizer(line.strip())
                    if min_sentence_len:
                        if len(_tokens) >= min_sentence_len:
                            tokens.append(
                                [vocab.stoi[token] for token in _tokens])
                    else:
                        tokens += [vocab.stoi[token] for token in _tokens]
        data[item] = tokens

    for key in data_select:
        if data[key] == []:
            raise TypeError('Dataset {} is empty!'.format(key))
    if min_sentence_len:
        return tuple(
            LanguageModelingDataset(data[d], vocab, lambda x: x, False)
            for d in data_select)
    else:
        return tuple(
            LanguageModelingDataset(
                torch.tensor(data[d]).long(), vocab, lambda x: x, False)
            for d in data_select)
def __init__(self, texts, labels, embed_dim, ngrams=3, num_epochs=5, seed=0):
    # set seed
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    self.texts = texts
    self.labels = labels
    self.embed_dim = embed_dim
    self.ngrams = ngrams

    # construct vocab
    print('Constructing vocabulary...')
    self.vocab = construct_vocab(texts, ngrams)
    self.vocab_size = len(self.vocab)

    # prepare dataset
    print('Preparing dataset...')
    self.train_dataset = make_torchdataset(self.vocab, texts, labels, ngrams)
    self.num_classes = len(self.train_dataset.get_labels())

    # prepare device ref and model
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    self.model = TextClassificationModel(self.vocab_size, self.embed_dim,
                                         self.num_classes).to(self.device)

    # loss function & optimization
    self.criterion = torch.nn.CrossEntropyLoss().to(self.device)
    self.optimizer = torch.optim.SGD(self.model.parameters(), lr=4.0)
    self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1,
                                                     gamma=0.9)
    self.batch_size = 16
    self.tokenizer = get_tokenizer("basic_english")

    if num_epochs > 0:
        print('Training model...')
        self.train(self.train_dataset, num_epochs)
def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"),
                    root='.data', vocab=None, removed_tokens=[],
                    data_select=('train', 'test', 'valid')):
    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(('train', 'test', 'valid'))):
        raise TypeError('data_select is not supported!')

    if dataset_name == 'PennTreebank':
        select_to_index = {'train': 0, 'test': 1, 'valid': 2}
        extracted_files = [download_from_url(
            URLS['PennTreebank'][select_to_index[key]], root=root)
            for key in data_select]
    else:
        dataset_tar = download_from_url(URLS[dataset_name], root=root)
        extracted_files = [os.path.join(root, d)
                           for d in extract_archive(dataset_tar)]

    _path = {}
    for item in data_select:
        _path[item] = _get_datafile_path(item, extracted_files)

    if vocab is None:
        if 'train' not in _path.keys():
            raise TypeError("Must pass a vocab if train is not selected.")
        logging.info('Building Vocab based on {}'.format(_path['train']))
        txt_iter = iter(tokenizer(row)
                        for row in io.open(_path['train'], encoding="utf8"))
        vocab = build_vocab_from_iterator(txt_iter)
        logging.info('Vocab has {} entries'.format(len(vocab)))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")

    data = {}
    for item in _path.keys():
        data[item] = []
        logging.info('Creating {} data'.format(item))
        txt_iter = iter(tokenizer(row)
                        for row in io.open(_path[item], encoding="utf8"))
        _iter = numericalize_tokens_from_iterator(vocab, txt_iter,
                                                  removed_tokens)
        for tokens in _iter:
            data[item] += [token_id for token_id in tokens]

    for key in data_select:
        if data[key] == []:
            raise TypeError('Dataset {} is empty!'.format(key))

    return tuple(LanguageModelingDataset(torch.tensor(data[d]).long(), vocab)
                 for d in data_select)
def _load_model():
    # First load into memory the variables that we will need to predict
    checkpoint_path = pathlib.Path(__file__).parent.absolute() / "state_dict.pt"
    checkpoint = torch.load(checkpoint_path)

    global VOCAB, MODEL, NGRAMS, TOKENIZER
    VOCAB = checkpoint["vocab"]
    # TODO load the model. You can get `embed_dim` and `num_class` from the checkpoint.
    # TODO Then, load the state dict of the model
    MODEL = ...
    MODEL...
    NGRAMS = checkpoint["ngrams"]
    TOKENIZER = get_tokenizer("basic_english")
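# A minimal sketch (not part of the original snippet) of one way to complete
# the TODOs in _load_model() above. It assumes -- purely as an illustration --
# that the checkpoint also stores "embed_dim", "num_class", and
# "model_state_dict" entries, and that a TextClassificationModel class with
# signature (vocab_size, embed_dim, num_class) is importable in this module.
def _load_model_sketch():
    checkpoint_path = pathlib.Path(__file__).parent.absolute() / "state_dict.pt"
    checkpoint = torch.load(checkpoint_path)

    global VOCAB, MODEL, NGRAMS, TOKENIZER
    VOCAB = checkpoint["vocab"]
    # hypothetical checkpoint keys; adapt to however the checkpoint was saved
    MODEL = TextClassificationModel(len(VOCAB),
                                    checkpoint["embed_dim"],
                                    checkpoint["num_class"])
    MODEL.load_state_dict(checkpoint["model_state_dict"])
    MODEL.eval()
    NGRAMS = checkpoint["ngrams"]
    TOKENIZER = get_tokenizer("basic_english")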
def build_legacy_pytext_vocab_pipeline(vocab_file):
    from pytext.data.utils import Vocabulary

    tokenizer = get_tokenizer("basic_english")
    f = open(vocab_file, 'r')
    vocab_counter = Counter([token for line in f for token in line.rstrip()])
    sorted_by_freq_tuples = sorted(vocab_counter.items(), key=lambda x: x[1],
                                   reverse=True)
    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")
    pipeline = sequential_transforms(
        tokenizer_func(tokenizer),
        PyTextVocabTransform(Vocabulary(vocab_list, unk_token="<unk>")))
    return pipeline, None, None
def prepare_data(device='cpu', train_batch_size=20, eval_batch_size=20):
    TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"),
                                init_token='<sos>',
                                eos_token='<eos>',
                                lower=True)
    train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)
    TEXT.build_vocab(train_txt)
    device = torch.device(device)
    train_data = batchify(train_txt, train_batch_size, TEXT, device)
    val_data = batchify(val_txt, eval_batch_size, TEXT, device)
    test_data = batchify(test_txt, eval_batch_size, TEXT, device)
    return train_data, val_data, test_data
def get_accuracy(ps_rref, data_dir, test_batch_size, job_name, target_loss):
    logger = Logger(
        job_name=job_name,
        file_dir=f'./measurement/logs/{job_name}_tester.log').logger

    train_iter = WikiText2(root=data_dir, split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter),
                                      specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    bptt = 35

    train_iter, val_iter, test_iter = WikiText2(root=data_dir)
    val_data = data_process(val_iter, vocab, tokenizer)
    val_data = batchify(val_data, test_batch_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()

    t0 = time.time()
    logger.info("Start!")
    init = t0
    while True:
        t1 = time.time()
        if t1 - t0 > 20:
            t0 = t1
            m = ps_rref.rpc_sync().get_model().to(device)
            test_loss = 0.
            with torch.no_grad():
                hidden = m.init_hidden(test_batch_size)
                for batch_idx, i in enumerate(
                        range(0, val_data.size(0) - 1, bptt)):
                    data, targets = get_batch(val_data, i, bptt)
                    data, targets = data.to(device), targets.to(device)
                    hidden = repackage_hidden(hidden)
                    output, hidden = m(data, hidden)
                    loss = criterion(output, targets)
                    test_loss += len(data) * loss.item()
            test_loss /= (len(val_data) - 1)
            logger.info("Test Loss: {:7.3f} | Time: {:7.2f} seconds".format(
                test_loss, (t1 - init)))
            if test_loss < target_loss:
                ps_rref.rpc_sync().stop()
                break
def build_batch_torchtext_vocab(vocab_file):
    from torchtext.data.utils import get_tokenizer
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator
    from transforms import TextClassificationPipeline

    def token_iterator(vocab_file):
        f = open(vocab_file, 'r')
        for token in f:
            yield token

    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    text_pipeline = TextDataPipeline(tokenizer, partial(map, vocab))
    label_pipeline = int
    return TextClassificationPipeline(label_pipeline, text_pipeline), None
def _csv_iterator(data_path, ngrams, yield_cls=False):
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        for row in reader:
            tokens = ' '.join(row[1:])
            # print(tokens)
            tokens = tokenizer(tokens)
            # print(tokens)
            if yield_cls:
                yield int(row[0]) - 1, ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
def build_legacy_batch_torchtext_vocab_pipeline(vocab_file):
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator
    from transforms import TextClassificationPipeline

    def token_iterator(vocab_file):
        f = open(vocab_file, 'r')
        for line in f:
            for token in line:
                yield token

    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    text_pipeline = sequential_transforms(tokenizer, vocab_func(vocab))
    label_pipeline = totensor(dtype=torch.long)
    return TextClassificationPipeline(label_pipeline, text_pipeline), None, None
def basic_english():
    """
    Basic English tokenizer.

    The character-level tokenizer is used by default in this experiment.
    You can switch to this tokenizer by setting

    ```
    'tokenizer': 'basic_english',
    ```

    in the configuration dictionary when starting the experiment.
    """
    return get_tokenizer('basic_english')
def get_data(device):
    TEXT = torchtext.data.Field(
        tokenize=get_tokenizer("basic_english"),
        init_token="<sos>",
        eos_token="<eos>",
        lower=True
    )
    train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)
    TEXT.build_vocab(train_txt)
    ntokens = len(TEXT.vocab.stoi)

    batch_size = 500
    eval_batch_size = 200
    train_data = batchify(train_txt, batch_size, TEXT, device)
    val_data = batchify(val_txt, eval_batch_size, TEXT, device)
    test_data = batchify(test_txt, eval_batch_size, TEXT, device)
    return ntokens, train_data, val_data, test_data
def get_src_trg(tokenize=True):
    if tokenize == False:
        SRC = Field(sequential=False, init_token='<sos>',
                    eos_token='<eos>', lower=False)
        TRG = Field(sequential=False, init_token='<sos>',
                    eos_token='<eos>', lower=False)
    else:
        SRC = Field(tokenize=get_tokenizer("spacy"), init_token='<sos>',
                    eos_token='<eos>', lower=False)
        TRG = Field(tokenize=get_tokenizer("spacy"), init_token='<sos>',
                    eos_token='<eos>', lower=False)
    return SRC, TRG
def build_torchtext_vocab(vocab_file):
    from torchtext.data.utils import get_tokenizer
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator
    from torchtext.experimental.functional import totensor, vocab_func, sequential_transforms

    def token_iterator(vocab_file):
        f = open(vocab_file, 'r')
        for token in f:
            yield token

    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    pipeline = sequential_transforms(tokenizer, vocab_func(vocab),
                                     totensor(dtype=torch.long))
    return pipeline, None, None
def predict(self, text: str):
    time_started = time()
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text_tensor = torch.tensor([
            self.vocab[token]
            for token in ngrams_iterator(tokenizer(text), NGRAMS)
        ])
        output_tensor = self.model(text_tensor, torch.tensor([0]))
        output = output_tensor.argmax(1).item()
    elapsed = time() - time_started
    logger.info(
        f"ModelWrapper.predict: [elapsed {elapsed:.2f}s]: "
        f"len(text)={len(text)}, len(tokens)={len(text_tensor)}, answer={output}"
    )
    return output
def __init__(self, train_batch_size=20, eval_batch_size=10, bptt=35):
    self.bptt = bptt
    train_iter = WikiText2(split='train')
    self.tokenizer = get_tokenizer('basic_english')
    counter = Counter()
    for line in train_iter:
        counter.update(self.tokenizer(line))
    self.vocab = Vocab(counter)

    train_iter, val_iter, test_iter = WikiText2()
    train_data = self.data_process(train_iter)
    val_data = self.data_process(val_iter)
    test_data = self.data_process(test_iter)
    self.train_data = self.batchify(train_data, train_batch_size)
    self.val_data = self.batchify(val_data, eval_batch_size)
    self.test_data = self.batchify(test_data, eval_batch_size)
def constructVocab(news_file_train, news_file_test, attrs, save_path):
    """
    Build a vocabulary from the news files using torchtext tokenization
    and pickle it to ``save_path``.
    """
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(
        news_token_generator(news_file_train, news_file_test, tokenizer, attrs))

    output = open(save_path, 'wb')
    pickle.dump(vocab, output)
    output.close()
def build_vocab(data_path, data_name, jsons, threshold, lang='en'):
    """Build vocabulary"""
    counter = Counter()
    for path in jsons[data_name]:
        full_path = os.path.join(os.path.join(data_path, data_name), path)
        if data_name == 'f8k':
            captions = cap_from_flickr_json(full_path)
        tokenizer = get_tokenizer('spacy', language=lang)
        for i, caption in enumerate(captions):
            counter.update(tokenizer(caption))
            if i % 1000 == 0:
                print("[%d/%d] tokenized the captions." % (i, len(captions)))

    # Create vocabulary with words whose number of occurrences is higher than threshold
    vocab = Vocab(counter, min_freq=threshold,
                  specials=('<unk>', '<pad>', '<start>', '<end>'))
    return vocab
def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([
            vocab[token]
            for token in ngrams_iterator(tokenizer(text), ngrams)
        ])
        output = model(text, torch.tensor([0]))
        ret = output > THRESHOLD
        result = []
        print(ret)
        cnt = 0
        for r in ret[0]:
            if r:
                result.append(cnt)
            cnt += 1
        return result
def _dataset_parser(self, inputdataset) -> Tuple[List, List]:
    """01. Token -> ID conversion pipelines"""

    def check_token(x):
        for token in x:
            if token not in self.vocabulary.keys():
                self.vocabulary[token] = len(self.vocabulary) + 1
        return self.vocabulary

    def sample_pipeline(x):
        vocab = check_token(x)
        return [vocab[token] for token in x]
        # return [self.vocabulary[token] for token in x]

    def label_pipeline(x):
        return int(x) - 1

    tokenizer = get_tokenizer('basic_english')
    samples, labels = [], []
    for (label, line) in inputdataset:
        tokens = tokenizer(line)
        if len(tokens) > 249:
            continue
        for pad in range(0, 250 - len(tokens)):
            tokens.append('PAD')
        word_embedding = sample_pipeline(tokens)
        # for pad in range(0, 250-len(word_embedding)):
        #     word_embedding.append(0)
        current_sample = torch.tensor(word_embedding, dtype=torch.int64)
        samples.append(current_sample)
        label_embedding = label_pipeline(label)
        current_label = torch.tensor(label_embedding, dtype=torch.int64)
        labels.append(current_label)
    return samples, labels
def _pd_iterator(data_to_parse: np.ndarray, ngrams: int, yield_cls: bool = False):
    """
    :param data_to_parse: array of two columns with label and text
    :param ngrams: number of ngrams
    :param yield_cls: whether to yield the text together with its label
    :return: generator used later when building the torch dataset
    """
    tokenizer = get_tokenizer(None)
    for row_id in range(len(data_to_parse)):
        tokens = data_to_parse[row_id][1]
        tokens = tokenizer(tokens)
        if yield_cls:
            yield data_to_parse[row_id][0], ngrams_iterator(tokens, ngrams)
        else:
            yield ngrams_iterator(tokens, ngrams)
def __init__(
    self,
    sequential=True,
    use_vocab=True,
    init_token=None,
    eos_token=None,
    fix_length=None,
    dtype=torch.long,
    preprocessing=None,
    postprocessing=None,
    lower=False,
    tokenize=None,
    tokenizer_language="en",
    include_lengths=False,
    batch_first=False,
    pad_token="<pad>",
    unk_token="<unk>",
    pad_first=False,
    truncate_first=False,
    stop_words=None,
    is_target=False,
):
    self.sequential = sequential
    self.use_vocab = use_vocab
    self.init_token = init_token
    self.eos_token = eos_token
    self.unk_token = unk_token
    self.fix_length = fix_length
    self.dtype = dtype
    self.preprocessing = preprocessing
    self.postprocessing = postprocessing
    self.lower = lower
    # store params to construct tokenizer for serialization
    # in case the tokenizer isn't picklable (e.g. spacy)
    self.tokenizer_args = (tokenize, tokenizer_language)
    self.tokenize = get_tokenizer(tokenize, tokenizer_language)
    self.include_lengths = include_lengths
    self.batch_first = batch_first
    self.pad_token = pad_token if self.sequential else None
    self.pad_first = pad_first
    self.truncate_first = truncate_first
    try:
        self.stop_words = set(stop_words) if stop_words is not None else None
    except TypeError:
        raise ValueError("Stop words must be convertible to a set")
    self.is_target = is_target
def make_torch_vocab(torch_text_path, corpus_type, min_freq=None):
    """Leverage torchtext experimental functions to build vocabularies.

    :param torch_text_path: string, file path to torchtext file
    :param corpus_type: string, Required. One of 'train', 'valid', 'test'
    :param min_freq: token counter min frequency threshold, if None -> 1.
    :return: vocabulary

    source: https://github.com/pytorch/text/blob/master/torchtext/experimental/vocab.py
    """
    logging.info('Starting make_torch_vocab()')
    if min_freq is None:
        min_freq = 1

    files = os.listdir(torch_text_path)
    if all([".tokens" in i for i in files]):
        logging.info(
            f'Found {corpus_type} .token files.\n'
            f'\tReturning corpora from disk instead of downloading them.\n'
            f'\tTo force a new download, delete or rename these files:\n'
            f'\t{files}')

    tokenizer = get_tokenizer('basic_english')
    vocabulary = {}
    for file in files:
        file_path = os.sep.join([torch_text_path, file])
        counter = Counter()
        f = open(file_path, 'r')
        for line in f:
            counter.update(tokenizer(line))
        v = Vocab(counter, min_freq=min_freq)
        key = 'train' if '.train.' in file else 'test' if '.test.' in file else 'valid'
        vocabulary.update({key: v})
        f.close()
        logging.info(f'Completed parsing vocab for {corpus_type}.')

    for k, v in vocabulary.items():
        logging.info(f'Dataset {k}: with vocabulary of length: {len(v)}.')
    return vocabulary
def get_datasets(args):
    download = True if hvd.local_rank() == 0 else False
    if not download:
        hvd.allreduce(torch.tensor(1), name="barrier")

    args.dir = os.path.join(args.dir, args.dataset)
    os.makedirs(args.dir, exist_ok=True)

    tokenizer = get_tokenizer("basic_english")
    if args.dataset == 'wikitext2':
        WikiText = datasets.WikiText2
    elif args.dataset == 'wikitext103':
        WikiText = datasets.WikiText103
    train_data, val_data, test_data = WikiText(tokenizer=tokenizer, root=args.dir)

    if args.verbose:
        print("")
    if download:
        hvd.allreduce(torch.tensor(1), name="barrier")

    ntokens = len(train_data.get_vocab())
    batch_size = args.batch_size * (args.bptt + 1)

    def collate(data):
        data = torch.stack(data)
        source = data.view(args.batch_size, -1).contiguous()
        data = source[:, :-1]
        target = source[:, 1:].contiguous().view(-1)
        return data, target

    torch.set_num_threads(4)
    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_data, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size, collate_fn=collate,
        drop_last=True, sampler=train_sampler, shuffle=False, **kwargs)

    val_sampler = torch.utils.data.distributed.DistributedSampler(
        val_data, num_replicas=hvd.size(), rank=hvd.rank())
    val_loader = torch.utils.data.DataLoader(
        val_data, batch_size=batch_size, collate_fn=collate,
        drop_last=True, sampler=val_sampler, shuffle=False, **kwargs)

    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_data, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(
        test_data, batch_size=batch_size, collate_fn=collate,
        drop_last=True, sampler=test_sampler, shuffle=False, **kwargs)

    return (train_sampler, train_loader), (val_sampler, val_loader), \
        (test_sampler, test_loader), ntokens
def iterator(start, num_lines):
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        for i, row in enumerate(reader):
            if i == start:
                break
        for _ in range(num_lines):
            tokens = ' '.join(row[1:])
            tokens = ngrams_iterator(tokenizer(tokens), ngrams)
            yield int(row[0]) - 1, torch.tensor(
                [vocab[token] for token in tokens])
            try:
                row = next(reader)
            except StopIteration:
                f.seek(0)
                reader = unicode_csv_reader(f)
                row = next(reader)
def _setup_datasets(dataset_name, tokenizer, root, vocab, data_select,
                    single_line, year, language):
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')

    data_select = check_default_set(data_select, ('train', 'test', 'valid'))

    if not single_line and dataset_name != 'WikiText103':
        raise TypeError('single_line must be True except for WikiText103')
    if vocab is None:
        if 'train' not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        if dataset_name == 'WMTNewsCrawl':
            raw_train, = raw.DATASETS[dataset_name](root=root,
                                                    data_select=('train', ),
                                                    year=year,
                                                    language=language)
        else:
            raw_train, = raw.DATASETS[dataset_name](root=root,
                                                    data_select=('train', ))
        logger_.info('Building Vocab based on train data')
        vocab = build_vocab(raw_train, tokenizer)
        logger_.info('Vocab has %d entries', len(vocab))

    def text_transform(line):
        return torch.tensor([vocab[token] for token in tokenizer(line)],
                            dtype=torch.long)

    if dataset_name == 'WMTNewsCrawl':
        raw_datasets = raw.DATASETS[dataset_name](root=root,
                                                  data_select=data_select,
                                                  year=year,
                                                  language=language)
    else:
        raw_datasets = raw.DATASETS[dataset_name](root=root,
                                                  data_select=data_select)

    raw_data = {
        name: list(map(text_transform, raw_dataset))
        for name, raw_dataset in zip(data_select, raw_datasets)
    }
    logger_.info('Building datasets for {}'.format(data_select))
    return tuple(
        LanguageModelingDataset(raw_data[item], vocab, text_transform,
                                single_line)
        for item in data_select)
def _setup_datasets(dataset_name, root='.data', vocab=None, tokenizer=None,
                    data_select=('train', 'dev')):
    text_transform = []
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')
    text_transform = sequential_transforms(tokenizer)

    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(('train', 'dev'))):
        raise TypeError(
            'Given data selection {} is not supported!'.format(data_select))

    train, dev = raw.DATASETS[dataset_name](root=root)
    raw_data = {
        'train': [item for item in train],
        'dev': [item for item in dev]
    }

    if vocab is None:
        if 'train' not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")

        def apply_transform(data):
            for (_context, _question, _answers, _ans_pos) in data:
                tok_ans = []
                for item in _answers:
                    tok_ans += text_transform(item)
                yield text_transform(_context) + text_transform(
                    _question) + tok_ans

        vocab = build_vocab_from_iterator(apply_transform(raw_data['train']),
                                          len(raw_data['train']))

    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    transforms = {
        'context': text_transform,
        'question': text_transform,
        'answers': text_transform,
        'ans_pos': totensor(dtype=torch.long)
    }
    return tuple(
        QuestionAnswerDataset(raw_data[item], vocab, transforms)
        for item in data_select)
def predict(text, model, dictionary, ngrams):
    r"""
    The predict() function here is used to test the model on a sample text.
    The input text is numericalized with the vocab and then sent to the model
    for inference.

    Args:
        text: a sample text string
        model: the trained model
        dictionary: a vocab object holding the string-to-index mapping
        ngrams: the number of ngrams.
    """
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([
            dictionary[token]
            for token in ngrams_iterator(tokenizer(text), ngrams)
        ])
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1