Exemplos de Field.tokenize em Python, exemplos de torchtext.data.Field.tokenize em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: interpolate.py Projeto: APodolskiy/SentenceVAE

def encode_sentence(model: nn.Module, sentence: str, field: Field,
                    device: torch.device) -> torch.Tensor:
    """Encode a raw sentence with ``model`` and return its ``'code'`` output.

    The sentence is tokenized with ``field.tokenize`` and every token is
    mapped to its index in ``field.vocab.stoi``; tokens missing from the
    vocabulary fall back to the index of ``UNK_TOKEN``. The indices are
    passed to ``model.encode`` with gradient tracking disabled.

    Args:
        model: Module exposing ``encode(inputs, lengths, use_mean=...)``
            whose result can be indexed with ``'code'``.
        sentence: Raw text to encode.
        field: Field providing ``tokenize`` and a built ``vocab``.
        device: Device the index tensor is moved to before encoding.

    Returns:
        The value stored under ``'code'`` in the result of ``model.encode``.
    """
    pieces = field.tokenize(sentence)
    stoi = field.vocab.stoi

    indices = []
    for piece in pieces:
        # Out-of-vocabulary tokens are looked up under the UNK entry.
        key = piece if piece in stoi else UNK_TOKEN
        indices.append(stoi[key])

    # Built as (1, seq_len), moved to the device, then reshaped into a
    # single column of shape (seq_len, 1).
    batch = torch.LongTensor([indices]).to(device).view(-1, 1)

    with torch.no_grad():
        # NOTE(review): after the reshape ``batch.size(1)`` is always 1, so
        # the lengths tensor is ``[1]`` whatever the sentence length --
        # confirm this is what ``model.encode`` expects.
        lengths = torch.LongTensor([batch.size(1)])
        z = model.encode(batch, lengths, use_mean=True)['code']
    return z

Exemplo n.º 2

0

Exibir arquivo

Arquivo: internal.py Projeto: liuxr98/torch_sentiment_analysis

    def __init__(self, text_field: Field, label_field: Field, score_field: Field, lexicon, dataset='sst-2'):
        """Load the dataset splits and rebuild them with per-token lexicon scores.

        Every kept example is rebuilt with three fields: ``text``, ``score``
        (built from the same raw text as ``text``) and ``label`` (the integer
        class taken from the dataset-specific mapping). Examples whose label
        is not in the mapping are dropped. The rebuilt splits are stored, in
        order, in ``self.dataset``; ``self.n_class`` is the number of
        distinct mapped classes.

        Side effects on the arguments: ``text_field.tokenize`` is set to
        ``identity`` while the splits are loaded and restored afterwards;
        ``score_field.tokenize`` is replaced by a scoring function;
        ``label_field.preprocessing`` is set to ``None``.

        Args:
            text_field: Field used for the ``text`` column.
            label_field: Field used for the ``label`` column.
            score_field: Field used for the ``score`` column. Its original
                ``tokenize`` is handed to ``get_tokenizer`` and the result is
                used to split text before scoring.
            lexicon: ``'wordnet'`` selects ``SentiWordNet`` with POS-tag
                based scoring; any other value is passed to ``Sentiment``.
            dataset: A name containing ``'sst'`` (``'sst-2'`` selects the
                two-class mapping, anything else the five-class one) or
                ``'imdb'``.

        Raises:
            LookupError: If ``dataset`` neither contains ``'sst'`` nor equals
                ``'imdb'``.
        """
        fields = [('text', text_field), ('score', score_field), ('label', label_field)]

        # Tokenize later: the splits are loaded with pass-through tokenizers,
        # the real ones are installed once the raw examples are available.
        text_tokenizer = text_field.tokenize
        text_field.tokenize = identity
        tokenizer = get_tokenizer(score_field.tokenize)

        if lexicon == 'wordnet':
            net = SentiWordNet(default=-1, exclude_stop_words=False)
        else:
            net = Sentiment(lexicon, default=-1)

        def build_score(text):
            """Return the lexicon score of every token in ``text``."""
            texts = tokenizer(text)
            return [net[tok] for tok in texts]

        def build_tag_score(text):
            """Return the POS-tag-aware score of every token in ``text``."""
            texts = tokenizer(text)
            tags = nltk.pos_tag(texts)
            return [net.get_score(tok, tag) for tok, tag in tags]

        score_field.tokenize = identity

        if 'sst' in dataset:
            phases = torchtext.datasets.SST.splits(text_field=text_field, label_field=label_field,
                                                   fine_grained=True, root=const.data_path)
            if dataset == 'sst-2':
                mapping = {'very positive': 1, 'positive': 1, 'negative': 0, 'very negative': 0}
            else:
                mapping = {'very positive': 0, 'positive': 1, 'negative': 2, 'very negative': 3, 'neutral': 4}
        elif dataset == 'imdb':
            phases = torchtext.datasets.IMDB.splits(text_field=text_field, label_field=label_field,
                                                    root=const.data_path)
            mapping = {
                'pos': 1,
                'neg': 0
            }
        else:
            raise LookupError('unsupported dataset: {!r}'.format(dataset))

        self.n_class = len(set(mapping.values()))
        label_field.preprocessing = None
        text_field.tokenize = text_tokenizer
        score_field.tokenize = build_tag_score if lexicon == 'wordnet' else build_score

        def process(sample):
            """Rebuild ``sample`` as (text, score source text, mapped label)."""
            return Example.fromlist([sample.text, sample.text, mapping[sample.label]], fields)

        # Examples are rebuilt sequentially. The process pool that used to be
        # created here was never given any work and never shut down, so it
        # has been removed.
        self.dataset = []
        for phase in phases:
            examples = [
                process(example)
                for example in phase.examples
                if example.label in mapping
            ]
            self.dataset.append(Dataset(examples, fields))

Exemplo n.º 3

0

Exibir arquivo

Arquivo: MultiSourceAPDataset.py Projeto: mfkiwl/hdlp

def _dynamic_dict(
        example: dict,
        src_types: List[str],
        src_types_fields: Dict[str, Field],
        tgt_field: Field,
) -> Tuple[Vocab, dict]:
    """Create a copy-vocab shared by all sources and numericalize with it.

    In-place adds to ``example``:

    * ``"src_map.<src_type>"`` for every entry of ``src_types``: the
      copy-vocab numericalization of the tokenized
      ``example["src.<src_type>"]``.
    * ``"src_ex_vocab"``: the copy-vocab itself.
    * ``"alignment"``, only if ``example`` has a ``"tgt"`` key: the
      copy-vocab numericalization of the tokenized ``example["tgt"]`` with
      an initial and final UNK index to match the BOS and EOS tokens.

    Args:
        example (dict): An example dictionary with one ``"src.<src_type>"``
            key per source type and maybe a ``"tgt"`` key. (This argument
            changes in place!)
        src_types (List[str]): Names of the source types to process.
        src_types_fields (Dict[str, Field]): Field object for each source
            type; all of them must share the same unk and pad tokens.
        tgt_field (torchtext.data.Field): Field object used to tokenize
            ``example["tgt"]``.

    Returns:
        torchtext.data.Vocab and ``example``, changed as described.

    Raises:
        AssertionError: If the source fields disagree on their unk or pad
            token.
    """
    unk = None
    pad = None
    src_counter: Counter = collections.Counter()

    # Each source is tokenized exactly once; the tokens are reused below for
    # the index mapping instead of running the tokenizer a second time.
    # (Tokenizers are expected to return a re-iterable sequence such as a list.)
    tokenized_srcs = []

    for src_type in src_types:
        field = src_types_fields[src_type]
        src = field.tokenize(example[f"src.{src_type}"])
        tokenized_srcs.append((src_type, src))

        # add into counter
        src_counter.update(src)

        # update or match unk, pad
        unk_ = field.unk_token
        if unk is None:
            unk = unk_
        else:
            assert unk == unk_, (
                f"unk token mismatch for source {src_type!r}: "
                f"{unk_!r} != {unk!r}")

        pad_ = field.pad_token
        if pad is None:
            pad = pad_
        else:
            assert pad == pad_, (
                f"pad token mismatch for source {src_type!r}: "
                f"{pad_!r} != {pad!r}")

    # Build src_ex_vocab  (shared among all srcs)
    src_ex_vocab = Vocab(src_counter, specials=[unk, pad])
    unk_idx = src_ex_vocab.stoi[unk]

    # Map source tokens to indices in the dynamic dict.
    for src_type, src in tokenized_srcs:
        src_map = torch.LongTensor([src_ex_vocab.stoi[w] for w in src])
        example[f"src_map.{src_type}"] = src_map

    example["src_ex_vocab"] = src_ex_vocab

    if "tgt" in example:
        tgt = tgt_field.tokenize(example["tgt"])
        # Leading/trailing UNK indices line up with the BOS/EOS positions.
        mask = torch.LongTensor(
            [unk_idx] + [src_ex_vocab.stoi[w] for w in tgt] + [unk_idx])
        example["alignment"] = mask
    return src_ex_vocab, example

Exemplo n.º 4

0

Exibir arquivo

Arquivo: seq2seq_model.py Projeto: roshan5395/Key-Phrase-Generation

        outputs = model(inp, target)
        loss = criterion(outputs[0].view(-1, len(src_field.vocab)),
                         target.view(-1))
        loss.backward()
        # torch.nn.utils.clip_grad_norm(model.parameters(), 2.0)
        optimizer.step()

        epoch_loss += loss.item()
    print(i, epoch_loss)

# In[ ]:

source_text = [
    "manual and gaze input cascaded ( magic ) pointing . this work explores a new direction in utilizing eye gaze for computer input . gaze tracking has long been considered as an alternative or potentially superior pointing method for computer input . we believe that many fundamental limitations exist with traditional gaze pointing . in particular , it is unnatural to overload a perceptual channel such as vision with a motor control task . we therefore propose an alternative approach , dubbed magic ( manual and gaze input cascaded ) pointing . with such an approach , pointing appears to the user to be a manual task , used for fine manipulation and selection . however , a large portion of the cursor movement is eliminated by warping the cursor to the eye gaze area , which encompasses the target . two specific magic pointing techniques , one conservative and one liberal , were designed , analyzed , and implemented with an eye tracker we developed . they were then tested in a pilot study . this early stage exploration showed that the magic pointing techniques might offer many advantages , including reduced physical effort and fatigue as compared to traditional manual pointing , greater accuracy and naturalness than traditional gaze pointing , and possibly faster speed than manual pointing . the pros and cons of the two techniques are discussed in light of both performance data and subjective reports"
]
# Tokenize the first source text, then numericalize it as a batch of one
# example and move the result to ``device``.
inp = src_field.tokenize(source_text[0])
inp = src_field.numericalize([inp]).to(device)

# In[ ]:

# NOTE(review): ``result`` is not used by any of the visible cells.
result = []
# Run only the encoder half of the model on the numericalized input.
enc_output = model.encoder(inp)

# In[ ]:

# Decode from the encoder output, then flatten the returned tensor, detach
# it from the autograd graph, move it to the CPU and convert it to a NumPy
# array.
# NOTE(review): ``length=3`` is presumably the number of decoding steps --
# confirm against ``infer_rnn_auto_regressive``.
res = model.decoder.infer_rnn_auto_regressive(
    encoder_output_dict=enc_output, vocab=src_field.vocab,
    length=3).view(-1).detach().cpu().numpy()

# In[2]:

Exemplo n.º 5

0

Exibir arquivo

# Load the test split and unwrap every array element into a plain Python
# object with ``.item()``.
test_videos = np.load('C:/Dataset/' + data + '/test_videos.npy')
test_videos = [video.item() for video in test_videos]
test_captions = np.load('C:/Dataset/' + data + '/test_captions.npy')
test_captions = [caption.item() for caption in test_captions]

# Notebook-style size check of all four lists.
len(train_videos), len(train_captions), len(test_videos), len(test_captions)

import spacy
import torchtext
from torchtext.data import Field, BucketIterator, TabularDataset
en = spacy.load('en')


def _tokenize_captions(captions):
    """Tokenize each sentence in ``captions`` with the spaCy tokenizer."""
    tokenized = []
    for sentence in captions:
        tokenized.append([tok.text for tok in en.tokenizer(sentence)])
    return tokenized


EN_TEXT = Field(
    init_token='<sos>',
    eos_token='<eos>',
    tokenize=_tokenize_captions,
    batch_first=True,
)
# The vocabulary is built from the tokenized training captions.
EN_TEXT.build_vocab(EN_TEXT.tokenize(train_captions))
len(EN_TEXT.vocab.stoi)

from collections import defaultdict

# Group the whitespace-split captions by the video at the same position.
train_references = defaultdict(list)
for i, caption in enumerate(train_captions):
    train_references[train_videos[i]].append(caption.split())

test_references = defaultdict(list)
for i, caption in enumerate(test_captions):
    test_references[test_videos[i]].append(caption.split())

len(train_references), len(test_references)

from torch.utils.data import Dataset