def encode_sentence(model: nn.Module, sentence: str, field: Field, device: torch.device) -> torch.Tensor:
    """Encode one raw sentence with ``model`` and return its ``'code'`` output.

    The sentence is tokenized with ``field.tokenize`` and every token is
    looked up in ``field.vocab.stoi``; tokens that are not in the vocabulary
    use the index stored for ``UNK_TOKEN``. The indices are sent to ``device``
    as a single-column tensor and passed to ``model.encode`` with gradient
    tracking disabled.

    Args:
        model: Object exposing ``encode(inputs, lengths, use_mean=...)`` whose
            result can be indexed with ``'code'``.
        sentence: Raw text to encode.
        field: Supplies ``tokenize`` and ``vocab.stoi``.
        device: Device the index tensor is moved to.

    Returns:
        The ``'code'`` entry of the value returned by ``model.encode``.
    """
    words = field.tokenize(sentence)
    stoi = field.vocab.stoi

    indices = []
    for word in words:
        # Out-of-vocabulary tokens are looked up under UNK_TOKEN instead.
        key = word if word in stoi else UNK_TOKEN
        indices.append(stoi[key])

    # Built as (1, seq_len), moved to the device, then reshaped to (seq_len, 1).
    column = torch.LongTensor([indices]).to(device).view(-1, 1)

    # NOTE(review): after ``view(-1, 1)`` the second dimension is always 1, so
    # the lengths tensor below is always ``[1]`` -- confirm against
    # ``model.encode`` whether the sequence length (``size(0)``) was intended.
    lengths = torch.LongTensor([column.size(1)])

    with torch.no_grad():
        encoded = model.encode(column, lengths, use_mean=True)
        return encoded['code']
def __init__(self, text_field: Field, label_field: Field, score_field: Field, lexicon, dataset='sst-2'):
    """Load a sentiment corpus and build examples with per-token lexicon scores.

    Every kept sample becomes an ``Example`` with three fields: ``text``,
    ``score`` (built from the same raw text by the lexicon scorer) and
    ``label`` (the integer class taken from the label mapping). One
    ``Dataset`` per phase returned by the torchtext loader is appended to
    ``self.dataset``; samples whose label is not in the mapping are dropped.

    Side effects on the passed-in fields: ``label_field.preprocessing`` is
    set to ``None`` and ``score_field.tokenize`` is replaced by the lexicon
    scorer. ``text_field.tokenize`` is only swapped out temporarily while
    the corpus is loaded.

    Args:
        text_field: Field used for the ``text`` column.
        label_field: Field used for the ``label`` column.
        score_field: Field used for the ``score`` column; its ``tokenize``
            attribute is handed to ``get_tokenizer`` to obtain the tokenizer
            used by the scorer.
        lexicon: ``'wordnet'`` selects ``SentiWordNet`` with POS-tag based
            scoring; any other value is passed to ``Sentiment``.
        dataset: Any name containing ``'sst'`` loads SST (``'sst-2'`` uses the
            two-class mapping, other SST names the five-class mapping);
            ``'imdb'`` loads IMDB.

    Raises:
        LookupError: If ``dataset`` is not one of the supported names.
    """
    fields = [('text', text_field), ('score', score_field), ('label', label_field)]

    # Resolve the corpus first so an unsupported name fails before any of the
    # caller's fields have been modified.
    if 'sst' in dataset:
        if dataset == 'sst-2':
            mapping = {'very positive': 1, 'positive': 1, 'negative': 0, 'very negative': 0}
        else:
            mapping = {'very positive': 0, 'positive': 1, 'negative': 2, 'very negative': 3, 'neutral': 4}

        def load_phases():
            return torchtext.datasets.SST.splits(text_field=text_field, label_field=label_field,
                                                 fine_grained=True, root=const.data_path)
    elif dataset == 'imdb':
        mapping = {'pos': 1, 'neg': 0}

        def load_phases():
            return torchtext.datasets.IMDB.splits(text_field=text_field, label_field=label_field,
                                                  root=const.data_path)
    else:
        raise LookupError('unsupported dataset: {!r}'.format(dataset))

    original_text_tokenize = text_field.tokenize
    original_score_tokenize = score_field.tokenize
    tokenizer = get_tokenizer(original_score_tokenize)

    if lexicon == 'wordnet':
        net = SentiWordNet(default=-1, exclude_stop_words=False)
    else:
        net = Sentiment(lexicon, default=-1)

    def build_score(text):
        # Score each token by direct lexicon lookup.
        return [net[tok] for tok in tokenizer(text)]

    def build_tag_score(text):
        # Score each token together with its POS tag.
        tagged = nltk.pos_tag(tokenizer(text))
        return [net.get_score(tok, tag) for tok, tag in tagged]

    # Tokenize later: the corpus is loaded with identity tokenizers and the
    # real tokenizers are applied when the examples are rebuilt below. The
    # originals are restored even if loading fails.
    text_field.tokenize = identity
    score_field.tokenize = identity
    try:
        phases = load_phases()
    finally:
        text_field.tokenize = original_text_tokenize
        score_field.tokenize = original_score_tokenize

    self.n_class = len(set(mapping.values()))
    label_field.preprocessing = None
    score_field.tokenize = build_tag_score if lexicon == 'wordnet' else build_score

    def process(sample):
        # The same text feeds both the ``text`` and the ``score`` field.
        return Example.fromlist([sample.text, sample.text, mapping[sample.label]], fields)

    self.dataset = []
    for phase in phases:
        examples = [process(sample) for sample in phase.examples if sample.label in mapping]
        self.dataset.append(Dataset(examples, fields))
def _dynamic_dict(
    example: dict,
    src_types: List[str],
    src_types_fields: Dict[str, Field],
    tgt_field: Field,
) -> Tuple[Vocab, dict]:
    """Create a copy-vocab shared by all source types and numericalize with it.

    In-place adds to ``example``:

    * ``"src_map.<src_type>"`` for every ``src_type`` in ``src_types``: the
      copy-vocab numericalization of the tokenized
      ``example["src.<src_type>"]``.
    * ``"src_ex_vocab"``: the copy-vocab itself, built from the tokens of all
      source types together.
    * ``"alignment"`` (only if ``example`` has a ``"tgt"`` key): the
      copy-vocab numericalization of the tokenized ``example["tgt"]``, with
      the UNK index added at the start and at the end to match the BOS and
      EOS tokens.

    Args:
        example (dict): An example dictionary with one ``"src.<src_type>"``
            key per entry of ``src_types`` and maybe a ``"tgt"`` key.
            (This argument changes in place!)
        src_types (List[str]): Names of the source types to process.
        src_types_fields (Dict[str, Field]): Field object for each source
            type. All fields must agree on ``unk_token`` and ``pad_token``.
        tgt_field (torchtext.data.Field): Field object used to tokenize
            ``example["tgt"]``.

    Returns:
        torchtext.data.Vocab and ``example``, changed as described.

    Raises:
        AssertionError: If the source fields disagree on their UNK or PAD
            token.
    """
    unk = None
    pad = None
    src_counter: Counter = collections.Counter()
    # Tokenize every source once and reuse the tokens for the mapping pass.
    src_tokens = {}
    for src_type in src_types:
        field = src_types_fields[src_type]
        tokens = list(field.tokenize(example[f"src.{src_type}"]))
        src_tokens[src_type] = tokens
        src_counter.update(tokens)

        # The vocab is shared, so every source field must use the same
        # special tokens.
        if unk is None:
            unk = field.unk_token
        else:
            assert unk == field.unk_token, "source fields disagree on unk_token"
        if pad is None:
            pad = field.pad_token
        else:
            assert pad == field.pad_token, "source fields disagree on pad_token"

    # Build src_ex_vocab (shared among all srcs)
    src_ex_vocab = Vocab(src_counter, specials=[unk, pad])
    unk_idx = src_ex_vocab.stoi[unk]

    # Map source tokens to indices in the dynamic dict.
    for src_type in src_types:
        example[f"src_map.{src_type}"] = torch.LongTensor(
            [src_ex_vocab.stoi[w] for w in src_tokens[src_type]])
    example["src_ex_vocab"] = src_ex_vocab

    if "tgt" in example:
        tgt = tgt_field.tokenize(example["tgt"])
        example["alignment"] = torch.LongTensor(
            [unk_idx] + [src_ex_vocab.stoi[w] for w in tgt] + [unk_idx])
    return src_ex_vocab, example
# NOTE(review): the statements down to `print(i, epoch_loss)` use `i`, `inp`,
# `target` and `epoch_loss`, none of which are set in this chunk; they look
# like the tail of a training loop whose header is not visible here --
# restore their nesting under that loop when merging.

# Run the model on the current input/target pair.
outputs = model(inp, target)
# Reshape the first output to rows of vocabulary-size width and flatten the
# target before handing both to the criterion.
loss = criterion(outputs[0].view(-1, len(src_field.vocab)), target.view(-1))
# Backpropagate, then apply the optimizer update.
loss.backward()
# torch.nn.utils.clip_grad_norm(model.parameters(), 2.0)
optimizer.step()
# Accumulate the scalar loss value.
epoch_loss += loss.item()
print(i, epoch_loss)

# In[ ]:

# A single pre-tokenized (space-separated) document used as inference input.
source_text = [
    "manual and gaze input cascaded ( magic ) pointing . this work explores a new direction in utilizing eye gaze for computer input . gaze tracking has long been considered as an alternative or potentially superior pointing method for computer input . we believe that many fundamental limitations exist with traditional gaze pointing . in particular , it is unnatural to overload a perceptual channel such as vision with a motor control task . we therefore propose an alternative approach , dubbed magic ( manual and gaze input cascaded ) pointing . with such an approach , pointing appears to the user to be a manual task , used for fine manipulation and selection . however , a large portion of the cursor movement is eliminated by warping the cursor to the eye gaze area , which encompasses the target . two specific magic pointing techniques , one conservative and one liberal , were designed , analyzed , and implemented with an eye tracker we developed . they were then tested in a pilot study . this early stage exploration showed that the magic pointing techniques might offer many advantages , including reduced physical effort and fatigue as compared to traditional manual pointing , greater accuracy and naturalness than traditional gaze pointing , and possibly faster speed than manual pointing . the pros and cons of the two techniques are discussed in light of both performance data and subjective reports"
]
# Tokenize the document, numericalize it as a batch of one and move it to the
# device. Note that this rebinds `inp`.
inp = src_field.tokenize(source_text[0])
inp = src_field.numericalize([inp]).to(device)

# In[ ]:

result = []
# Encode the document; the result is passed to the decoder below as
# `encoder_output_dict`.
enc_output = model.encoder(inp)

# In[ ]:

# Decode auto-regressively with length=3, then flatten the output, detach it,
# move it to the CPU and convert it to a NumPy array.
res = model.decoder.infer_rnn_auto_regressive(
    encoder_output_dict=enc_output, vocab=src_field.vocab, length=3).view(-1).detach().cpu().numpy()

# In[2]:
# Load the test split; every stored entry is unwrapped with ``.item()``.
test_videos = np.load('C:/Dataset/' + data + '/test_videos.npy')
test_videos = [entry.item() for entry in test_videos]
test_captions = np.load('C:/Dataset/' + data + '/test_captions.npy')
test_captions = [entry.item() for entry in test_captions]

# Notebook-style inspection of the split sizes.
(len(train_videos), len(train_captions), len(test_videos), len(test_captions))

import spacy
import torchtext
from torchtext.data import Field, BucketIterator, TabularDataset

en = spacy.load('en')

# The tokenizer takes a whole collection of captions and returns one token
# list per caption.
EN_TEXT = Field(
    init_token='<sos>',
    eos_token='<eos>',
    tokenize=lambda texts: [
        [token.text for token in en.tokenizer(text)]
        for text in texts
    ],
    batch_first=True,
)
EN_TEXT.build_vocab(EN_TEXT.tokenize(train_captions))

# Notebook-style inspection of the vocabulary size.
len(EN_TEXT.vocab.stoi)

from collections import defaultdict

# Group the whitespace-split captions by the video at the same position.
train_references = defaultdict(list)
for i, caption in enumerate(train_captions):
    train_references[train_videos[i]].append(caption.split())

test_references = defaultdict(list)
for i, caption in enumerate(test_captions):
    test_references[test_videos[i]].append(caption.split())

# Notebook-style inspection of the number of distinct videos per split.
(len(train_references), len(test_references))

from torch.utils.data import Dataset