def process_language(self, ex, traj, r_idx, use_templated_goals=False):
    # goal instruction
    if use_templated_goals:
        task_desc = sample_templated_task_desc_from_traj_data(traj)
    else:
        task_desc = ex['turk_annotations']['anns'][r_idx]['task_desc']

    # step-by-step instructions
    high_descs = ex['turk_annotations']['anns'][r_idx]['high_descs']

    # tokenize language
    traj['ann'] = {
        'goal': revtok.tokenize(remove_spaces_and_lower(task_desc)) + ['<<goal>>'],
        'instr': [revtok.tokenize(remove_spaces_and_lower(x)) for x in high_descs] + [['<<stop>>']],
        'repeat_idx': r_idx
    }

    # numericalize language
    traj['num'] = {}
    traj['num']['lang_goal'] = self.numericalize(self.vocab['word'], traj['ann']['goal'], train=True)
    traj['num']['lang_instr'] = [
        self.numericalize(self.vocab['word'], x, train=True)
        for x in traj['ann']['instr']
    ]
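# Illustrative only: a minimal sketch of the ALFRED-style annotation structure
# that process_language above reads (keys taken from the accesses in the code);
# the surrounding class supplies vocab and numericalize, so no call is shown.
ex_sketch = {
    'turk_annotations': {
        'anns': [{
            'task_desc': 'Put a clean mug on the coffee table.',
            'high_descs': ['Walk to the sink.',
                           'Rinse the mug.',
                           'Carry the mug to the coffee table.'],
        }]
    }
}
traj_sketch = {}  # the method fills traj['ann'] and traj['num'] in place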
def process_language(self, ex, traj, r_idx):
    # tokenize language
    traj['ann'] = {
        'goal': revtok.tokenize(remove_spaces_and_lower(
            ex['turk_annotations']['anns'][r_idx]['task_desc'])) + ['<<goal>>'],
        'instr': [
            revtok.tokenize(remove_spaces_and_lower(x))
            for x in ex['turk_annotations']['anns'][r_idx]['high_descs']
        ] + [['<<stop>>']],
        'repeat_idx': r_idx
    }

    # numericalize language
    traj['num'] = {}
    traj['num']['lang_goal'] = self.numericalize(self.vocab['word'], traj['ann']['goal'], train=True)
    traj['num']['lang_instr'] = [
        self.numericalize(self.vocab['word'], x, train=True)
        for x in traj['ann']['instr']
    ]
def preprocess_data(cls, raw):
    names = sorted(raw.keys())
    vocab = set()
    for name in names:
        split = raw[name]
        for ex in tqdm(split, desc='tokenizing {}'.format(name)):
            ex['words1'] = w = revtok.tokenize(ex['sent1'])
            vocab.update(w)
            ex['words2'] = w = revtok.tokenize(ex['sent2'])
            vocab.update(w)
    # index2word: list mapping index -> word; word2index: dict mapping word -> index
    index2word = ['PAD'] + sorted(vocab)
    word2index = {w: i for i, w in enumerate(index2word)}
    for name in names:
        split = raw[name]
        for ex in tqdm(split, desc='numericalizing {}'.format(name)):
            ex['ids1'] = [word2index[w] for w in ex['words1']]
            ex['mask1'] = [1] * len(ex['ids1'])
            ex['ids2'] = [word2index[w] for w in ex['words2']]
            ex['mask2'] = [1] * len(ex['ids2'])
            del ex['sent1']
            del ex['sent2']
            del ex['words1']
            del ex['words2']
    return dict(splits=raw, word2index=word2index, index2word=index2word)
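# A minimal usage sketch for preprocess_data, assuming the module-level revtok
# and tqdm imports are in place; `raw` maps split names to lists of sentence
# pairs with the 'sent1'/'sent2' keys the function reads. The split contents
# here are hypothetical.
raw_sketch = {
    'train': [{'sent1': 'How are you?', 'sent2': 'I am fine.'}],
    'dev': [{'sent1': 'Hello there.', 'sent2': 'Hi!'}],
}
processed = preprocess_data(None, raw_sketch)  # cls is unused in the body
# each example now carries 'ids1'/'mask1'/'ids2'/'mask2' instead of raw text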
def process_language(self, ex, traj, r_idx):
    # tokenize language
    traj['num'] = {}
    if self.args.use_bert:
        traj['ann'] = {
            'goal': "Goal: " + remove_spaces_and_lower(
                ex['turk_annotations']['anns'][r_idx]['task_desc']),
            'instr': [
                "Instruction: " + remove_spaces_and_lower(x)
                for x in ex['turk_annotations']['anns'][r_idx]['high_descs']
            ] + ['End.'],  # instr entries are plain strings, so the stop marker is a string too
            'repeat_idx': r_idx
        }
        traj['num']['lang_goal'] = self.tokenizer.encode(
            traj['ann']['goal'],
            add_special_tokens=True,
            max_length=self.max_length,
            is_pretokenized=False,
            truncation=True)
        traj['num']['lang_instr'] = [
            self.tokenizer.encode(x,
                                  add_special_tokens=True,
                                  max_length=self.max_length,
                                  is_pretokenized=False,
                                  truncation=True)
            for x in traj['ann']['instr']
        ]
    else:
        traj['ann'] = {
            'goal': revtok.tokenize(remove_spaces_and_lower(
                ex['turk_annotations']['anns'][r_idx]['task_desc'])) + ['<<goal>>'],
            'instr': [
                revtok.tokenize(remove_spaces_and_lower(x))
                for x in ex['turk_annotations']['anns'][r_idx]['high_descs']
            ] + [['<<stop>>']],
            'repeat_idx': r_idx
        }
        traj['num']['lang_goal'] = self.numericalize(self.vocab['word'], traj['ann']['goal'], train=True)
        traj['num']['lang_instr'] = [
            self.numericalize(self.vocab['word'], x, train=True)
            for x in traj['ann']['instr']
        ]
def process_language(self, ex, traj, r_idx):
    # tokenize language
    if not self.subgoal_ann:
        goal_ann = ex['turk_annotations']['anns'][r_idx]['task_desc']
        instr_anns = ex['turk_annotations']['anns'][r_idx]['high_descs']

        # tokenize annotations
        goal_ann = revtok.tokenize(py_util.remove_spaces_and_lower(goal_ann))
        instr_anns = [
            revtok.tokenize(py_util.remove_spaces_and_lower(instr_ann))
            for instr_ann in instr_anns
        ]
        # this might not be needed
        goal_ann = [w.strip().lower() for w in goal_ann]
        instr_anns = [[w.strip().lower() for w in instr_ann]
                      for instr_ann in instr_anns]
    else:
        goal_ann = ['<<seg>>']
        instr_anns = [[a['action']] + a['action_high_args']
                      for a in traj['num']['action_high']]
        instr_anns = [[self.vocab['action_high'].index2word(w) for w in instr_ann]
                      for instr_ann in instr_anns]

    traj['ann'] = {
        'goal': goal_ann + ['<<goal>>'],
        'instr': [instr_ann + ['<<instr>>'] for instr_ann in instr_anns],
        'repeat_idx': r_idx
    }
    if not self.subgoal_ann:
        traj['ann']['instr'] += [['<<stop>>']]

    # convert words to tokens
    if 'num' not in traj:
        traj['num'] = {}
    traj['num']['lang_goal'] = self.numericalize(
        self.vocab['word'], traj['ann']['goal'], train=not self.is_test_split)
    traj['num']['lang_instr'] = [
        self.numericalize(self.vocab['word'], x, train=not self.is_test_split)
        for x in traj['ann']['instr']
    ]
def subtokenize(doc):
    if not doc.strip():
        return []
    tokens = []
    for i, t in enumerate(revtok.tokenize(doc)):
        subtokens = bert_tokenizer.tokenize(t.strip())
        for st in subtokens:
            tokens.append({'orig': t, 'sub': st, 'orig_id': i})
    return tokens
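# A minimal sketch of driving subtokenize, assuming `bert_tokenizer` is a
# Hugging Face WordPiece tokenizer (the function above reads it as a global);
# the checkpoint name is an example choice.
import revtok
from transformers import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
for piece in subtokenize('Unbelievable results'):
    # each dict links a WordPiece ('sub') back to its revtok token ('orig')
    # and that token's position in the revtok tokenization ('orig_id')
    print(piece)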
def get_tokenizer(tokenizer):
    if callable(tokenizer):
        return tokenizer
    if tokenizer == "spacy":
        try:
            import spacy
            spacy_en = spacy.load('en')
            return lambda s: [tok.text for tok in spacy_en.tokenizer(s)]
        except (ImportError, AttributeError):
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
    elif tokenizer == "moses":
        try:
            from sacremoses import MosesTokenizer
            moses_tokenizer = MosesTokenizer()
            return moses_tokenizer.tokenize
        except ImportError:
            print("Please install SacreMoses. "
                  "See the docs at https://github.com/alvations/sacremoses "
                  "for more information.")
            raise
    elif tokenizer == "toktok":
        try:
            from nltk.tokenize.toktok import ToktokTokenizer
            toktok = ToktokTokenizer()
            return toktok.tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at https://nltk.org for more information.")
            raise
    elif tokenizer == 'revtok':
        try:
            import revtok
            return revtok.tokenize
        except ImportError:
            print("Please install revtok.")
            raise
    elif tokenizer == 'subword':
        try:
            import revtok
            return lambda x: revtok.tokenize(x, decap=True)
        except ImportError:
            print("Please install revtok.")
            raise
    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"revtok\" for the revtok reversible tokenizer, "
                     "\"subword\" for the revtok caps-aware tokenizer, "
                     "\"spacy\" for the SpaCy English tokenizer, "
                     "\"toktok\" for the NLTK Toktok tokenizer, or "
                     "\"moses\" for the sacremoses port of the Moses "
                     "tokenization script.".format(tokenizer))
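# A brief usage sketch: string names resolve to a tokenizer function, and a
# custom callable passes straight through unchanged.
tokenize_fn = get_tokenizer('revtok')
tokens = tokenize_fn('The quick brown fox.')  # revtok keeps spacing so text can be detokenized
split_fn = get_tokenizer(str.split)           # any callable is returned as-is
assert split_fn('a b') == ['a', 'b']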
def calculate_reward(self, all_obs):
    # Unpack values from input.
    high = [o['mission'] for o in all_obs[0]]
    obs = []
    for i in range(len(all_obs[0])):
        obs.append([o[i]['image'] for o in all_obs])

    # Tokenize highs.
    high = [revtok.tokenize(self.remove_spaces_and_lower(h)) for h in high]  # -> M
    high = [
        self.vocab.word2index([
            w.strip().lower()
            if w.strip().lower() in self.vocab.to_dict()['index2word']
            else '<<pad>>'
            for w in h
        ]) for h in high
    ]  # -> M

    # Put on device.
    high = torch.tensor(high, dtype=torch.long)
    high = high.reshape(len(high), -1).to(self.device)  # -> B x M
    high_len = high.bool().byte().sum(dim=1).view(-1).to(self.device)
    traj = torch.tensor(obs, dtype=torch.float).view(
        len(obs), len(obs[0]), self.img_shape).to(self.device)  # B x M x 147
    traj_len = torch.full((traj.shape[0],), traj.shape[1]).long().to(self.device)

    # Compute CPV reward with new observation incorporated.
    with torch.no_grad():
        self.eval()
        sims = self.compute_similarity(high, traj, high_len, traj_len)

    # Potential-based reward is the delta in similarity between the previous
    # and current trajectory prefixes.
    reward = sims[:, 1:] - sims[:, :-1]
    reward = torch.cat([
        torch.zeros((reward.shape[0], 1), dtype=torch.float).to(self.device),
        reward
    ], dim=1)
    return reward.detach()
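# Illustrative only: the tail of calculate_reward implements potential-based
# shaping, r_t = phi(s_{t+1}) - phi(s_t), with the goal/trajectory similarity
# acting as the potential phi. A toy check of the delta-and-pad logic:
import torch

sims = torch.tensor([[0.1, 0.4, 0.3, 0.9]])        # phi for each trajectory prefix
reward = sims[:, 1:] - sims[:, :-1]                # delta between consecutive prefixes
reward = torch.cat([torch.zeros(1, 1), reward], dim=1)
print(reward)  # tensor([[ 0.0000,  0.3000, -0.1000,  0.6000]])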
def lookup_sentence(self, sent, vocab, max_len=10, eos='pad', pad='pad'):
    if isinstance(sent, list):
        # already tokenized: truncate, append eos, then pad to max_len
        words = sent[:max_len - 1] + [eos]
        length = len(words)
        if len(words) < max_len:
            words += [pad] * (max_len - len(words))
        return vocab.word2index([w.strip() for w in words]), length
    else:
        sent = sent.lower()
        key = sent, max_len
        if key not in self._cache:
            words = revtok.tokenize(sent)[:max_len - 1] + [eos]
            length = len(words)
            if len(words) < max_len:
                words += [pad] * (max_len - len(words))
            self._cache[key] = vocab.word2index([w.strip() for w in words]), length
            # evict random entries when the cache grows past its budget
            while len(self._cache) > self.max_cache:
                keys = list(self._cache.keys())
                del self._cache[random.choice(keys)]
        return self._cache[key]
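# A minimal sketch of the pad/truncate behavior on the pre-tokenized path,
# using a hypothetical vocab stand-in; self is untouched on that path, so the
# sketch passes None for it.
class VocabSketch:
    def __init__(self, words):
        self.w2i = {w: i for i, w in enumerate(words)}
    def word2index(self, words):
        return [self.w2i.get(w, self.w2i['pad']) for w in words]

vocab = VocabSketch(['pad', 'hello', 'world'])
ids, length = lookup_sentence(None, ['hello', 'world'], vocab, max_len=5)
# -> ids [1, 2, 0, 0, 0] (eos and pad both map to 'pad'), length 3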
def tokenized_description(self):
    return revtok.tokenize(self.describe()) if hasattr(self, 'describe') else self.tokenized_name()
def tokenized_name(self):
    return revtok.tokenize(self.name)
def tokenize(self, sentence, field_name=None):
    if not sentence:
        return [], None
    return revtok.tokenize(sentence), None
def tokenize(sent):
    return [w.strip() for w in revtok.tokenize(sent.lower())]
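# revtok attaches the surrounding whitespace to each token so that
# revtok.detokenize can reproduce the original string; the strip() in
# tokenize above discards that spacing (and lower() drops case).
import revtok

raw_tokens = revtok.tokenize('Hello, World!')   # tokens carry their spacing
clean = tokenize('Hello, World!')               # lowercased, spacing stripped
assert revtok.detokenize(raw_tokens) == 'Hello, World!'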
        self.inverse_doc_freqs = idf

    def _term_freqs(self, doc):
        # normalize each distinct token's count by the document length;
        # iterating over the counter keys divides each count exactly once
        counter = Counter(doc)
        for token in counter:
            counter[token] /= len(doc)
        return counter


if __name__ == '__main__':
    # Interactive testing for relevant memories retrieval
    import revtok
    from dataset import Dataset

    dataset = Dataset()
    kv_memory = KeyValueMemory(dataset)

    print('Interactive memory retrieval. {} to cancel\n'.format(
        colorize('Press CTRL + C', color='white')))

    try:
        while True:
            query = revtok.tokenize(input('> ').strip())
            queries, responses, _ = kv_memory.address(query)
            for key, value in zip(queries, responses):
                print('\nQ: {query}'.format(query=revtok.detokenize(key)))
                print('R: {response}'.format(response=revtok.detokenize(value)))
            print()
    except (KeyboardInterrupt, EOFError):
        print('\n\nShutting down')
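# A quick check of _term_freqs on a toy document; self is unused in the method,
# so calling through the class with None works (assumes the module imports
# Counter from collections, as the method requires).
tf = KeyValueMemory._term_freqs(None, ['to', 'be', 'or', 'not', 'to', 'be'])
assert abs(tf['to'] - 2 / 6) < 1e-9  # 'to' appears twice among six tokens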