def process_language(self, ex, traj, r_idx, use_templated_goals=False):
    # goal instruction
    if use_templated_goals:
        task_desc = sample_templated_task_desc_from_traj_data(traj)
    else:
        task_desc = ex['turk_annotations']['anns'][r_idx]['task_desc']

    # step-by-step instructions
    high_descs = ex['turk_annotations']['anns'][r_idx]['high_descs']

    # tokenize language
    traj['ann'] = {
        'goal': revtok.tokenize(remove_spaces_and_lower(task_desc)) + ['<<goal>>'],
        'instr': [revtok.tokenize(remove_spaces_and_lower(x)) for x in high_descs] + [['<<stop>>']],
        'repeat_idx': r_idx
    }

    # numericalize language
    traj['num'] = {}
    traj['num']['lang_goal'] = self.numericalize(self.vocab['word'], traj['ann']['goal'], train=True)
    traj['num']['lang_instr'] = [
        self.numericalize(self.vocab['word'], x, train=True)
        for x in traj['ann']['instr']
    ]
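# Illustrative only: a minimal sketch of the ALFRED-style annotation structure
# that process_language above reads (keys taken from the accesses in the code);
# the surrounding class supplies vocab and numericalize, so no call is shown.
ex_sketch = {
    'turk_annotations': {
        'anns': [{
            'task_desc': 'Put a clean mug on the coffee table.',
            'high_descs': ['Walk to the sink.',
                           'Rinse the mug.',
                           'Carry the mug to the coffee table.'],
        }]
    }
}
traj_sketch = {}  # the method fills traj['ann'] and traj['num'] in place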
def process_language(self, ex, traj, r_idx):
    # tokenize language
    traj['ann'] = {
        'goal': revtok.tokenize(remove_spaces_and_lower(
            ex['turk_annotations']['anns'][r_idx]['task_desc'])) + ['<<goal>>'],
        'instr': [
            revtok.tokenize(remove_spaces_and_lower(x))
            for x in ex['turk_annotations']['anns'][r_idx]['high_descs']
        ] + [['<<stop>>']],
        'repeat_idx': r_idx
    }

    # numericalize language
    traj['num'] = {}
    traj['num']['lang_goal'] = self.numericalize(self.vocab['word'], traj['ann']['goal'], train=True)
    traj['num']['lang_instr'] = [
        self.numericalize(self.vocab['word'], x, train=True)
        for x in traj['ann']['instr']
    ]
def preprocess_data(cls, raw):
    names = sorted(raw.keys())
    vocab = set()
    for name in names:
        split = raw[name]
        for ex in tqdm(split, desc='tokenizing {}'.format(name)):
            ex['words1'] = w = revtok.tokenize(ex['sent1'])
            vocab.update(w)
            ex['words2'] = w = revtok.tokenize(ex['sent2'])
            vocab.update(w)
    # index2word: list mapping index -> word; word2index: dict mapping word -> index
    index2word = ['PAD'] + sorted(vocab)
    word2index = {w: i for i, w in enumerate(index2word)}
    for name in names:
        split = raw[name]
        for ex in tqdm(split, desc='numericalizing {}'.format(name)):
            ex['ids1'] = [word2index[w] for w in ex['words1']]
            ex['mask1'] = [1] * len(ex['ids1'])
            ex['ids2'] = [word2index[w] for w in ex['words2']]
            ex['mask2'] = [1] * len(ex['ids2'])
            del ex['sent1']
            del ex['sent2']
            del ex['words1']
            del ex['words2']
    return dict(splits=raw, word2index=word2index, index2word=index2word)
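# A minimal usage sketch for preprocess_data, assuming the module-level revtok
# and tqdm imports are in place; `raw` maps split names to lists of sentence
# pairs with the 'sent1'/'sent2' keys the function reads. The split contents
# here are hypothetical.
raw_sketch = {
    'train': [{'sent1': 'How are you?', 'sent2': 'I am fine.'}],
    'dev': [{'sent1': 'Hello there.', 'sent2': 'Hi!'}],
}
processed = preprocess_data(None, raw_sketch)  # cls is unused in the body
# each example now carries 'ids1'/'mask1'/'ids2'/'mask2' instead of raw text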
def process_language(self, ex, traj, r_idx):
    # tokenize language
    traj['num'] = {}
    if self.args.use_bert:
        traj['ann'] = {
            'goal': "Goal: " + remove_spaces_and_lower(
                ex['turk_annotations']['anns'][r_idx]['task_desc']),
            'instr': [
                "Instruction: " + remove_spaces_and_lower(x)
                for x in ex['turk_annotations']['anns'][r_idx]['high_descs']
            ] + ['End.'],  # instr entries are plain strings, so the stop marker is a string too
            'repeat_idx': r_idx
        }
        traj['num']['lang_goal'] = self.tokenizer.encode(
            traj['ann']['goal'],
            add_special_tokens=True,
            max_length=self.max_length,
            is_pretokenized=False,
            truncation=True)
        traj['num']['lang_instr'] = [
            self.tokenizer.encode(x,
                                  add_special_tokens=True,
                                  max_length=self.max_length,
                                  is_pretokenized=False,
                                  truncation=True)
            for x in traj['ann']['instr']
        ]
    else:
        traj['ann'] = {
            'goal': revtok.tokenize(remove_spaces_and_lower(
                ex['turk_annotations']['anns'][r_idx]['task_desc'])) + ['<<goal>>'],
            'instr': [
                revtok.tokenize(remove_spaces_and_lower(x))
                for x in ex['turk_annotations']['anns'][r_idx]['high_descs']
            ] + [['<<stop>>']],
            'repeat_idx': r_idx
        }
        traj['num']['lang_goal'] = self.numericalize(self.vocab['word'], traj['ann']['goal'], train=True)
        traj['num']['lang_instr'] = [
            self.numericalize(self.vocab['word'], x, train=True)
            for x in traj['ann']['instr']
        ]
def process_language(self, ex, traj, r_idx):
    # tokenize language
    if not self.subgoal_ann:
        goal_ann = ex['turk_annotations']['anns'][r_idx]['task_desc']
        instr_anns = ex['turk_annotations']['anns'][r_idx]['high_descs']

        # tokenize annotations
        goal_ann = revtok.tokenize(py_util.remove_spaces_and_lower(goal_ann))
        instr_anns = [
            revtok.tokenize(py_util.remove_spaces_and_lower(instr_ann))
            for instr_ann in instr_anns
        ]
        # this might not be needed
        goal_ann = [w.strip().lower() for w in goal_ann]
        instr_anns = [[w.strip().lower() for w in instr_ann]
                      for instr_ann in instr_anns]
    else:
        goal_ann = ['<<seg>>']
        instr_anns = [[a['action']] + a['action_high_args']
                      for a in traj['num']['action_high']]
        instr_anns = [[self.vocab['action_high'].index2word(w) for w in instr_ann]
                      for instr_ann in instr_anns]

    traj['ann'] = {
        'goal': goal_ann + ['<<goal>>'],
        'instr': [instr_ann + ['<<instr>>'] for instr_ann in instr_anns],
        'repeat_idx': r_idx
    }
    if not self.subgoal_ann:
        traj['ann']['instr'] += [['<<stop>>']]

    # convert words to tokens
    if 'num' not in traj:
        traj['num'] = {}
    traj['num']['lang_goal'] = self.numericalize(
        self.vocab['word'], traj['ann']['goal'], train=not self.is_test_split)
    traj['num']['lang_instr'] = [
        self.numericalize(self.vocab['word'], x, train=not self.is_test_split)
        for x in traj['ann']['instr']
    ]
def subtokenize(doc):
    if not doc.strip():
        return []
    tokens = []
    for i, t in enumerate(revtok.tokenize(doc)):
        subtokens = bert_tokenizer.tokenize(t.strip())
        for st in subtokens:
            tokens.append({'orig': t, 'sub': st, 'orig_id': i})
    return tokens
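# A minimal sketch of driving subtokenize, assuming `bert_tokenizer` is a
# Hugging Face WordPiece tokenizer (the function above reads it as a global);
# the checkpoint name is an example choice.
import revtok
from transformers import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
for piece in subtokenize('Unbelievable results'):
    # each dict links a WordPiece ('sub') back to its revtok token ('orig')
    # and that token's position in the revtok tokenization ('orig_id')
    print(piece)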
def get_tokenizer(tokenizer):
    if callable(tokenizer):
        return tokenizer
    if tokenizer == "spacy":
        try:
            import spacy
            spacy_en = spacy.load('en')
            return lambda s: [tok.text for tok in spacy_en.tokenizer(s)]
        except (ImportError, AttributeError):
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
    elif tokenizer == "moses":
        try:
            from sacremoses import MosesTokenizer
            moses_tokenizer = MosesTokenizer()
            return moses_tokenizer.tokenize
        except ImportError:
            print("Please install SacreMoses. "
                  "See the docs at https://github.com/alvations/sacremoses "
                  "for more information.")
            raise
    elif tokenizer == "toktok":
        try:
            from nltk.tokenize.toktok import ToktokTokenizer
            toktok = ToktokTokenizer()
            return toktok.tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at https://nltk.org for more information.")
            raise
    elif tokenizer == 'revtok':
        try:
            import revtok
            return revtok.tokenize
        except ImportError:
            print("Please install revtok.")
            raise
    elif tokenizer == 'subword':
        try:
            import revtok
            return lambda x: revtok.tokenize(x, decap=True)
        except ImportError:
            print("Please install revtok.")
            raise
    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"revtok\" for the revtok reversible tokenizer, "
                     "\"subword\" for the revtok caps-aware tokenizer, "
                     "\"spacy\" for the SpaCy English tokenizer, "
                     "\"toktok\" for the NLTK Toktok tokenizer, or "
                     "\"moses\" for the sacremoses port of the Moses "
                     "tokenization script.".format(tokenizer))
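# A brief usage sketch: string names resolve to a tokenizer function, and a
# custom callable passes straight through unchanged.
tokenize_fn = get_tokenizer('revtok')
tokens = tokenize_fn('The quick brown fox.')  # revtok keeps spacing so text can be detokenized
split_fn = get_tokenizer(str.split)           # any callable is returned as-is
assert split_fn('a b') == ['a', 'b']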
def calculate_reward(self, all_obs):
    # Unpack values from input.
    high = [o['mission'] for o in all_obs[0]]
    obs = []
    for i in range(len(all_obs[0])):
        obs.append([o[i]['image'] for o in all_obs])

    # Tokenize highs.
    high = [revtok.tokenize(self.remove_spaces_and_lower(h)) for h in high]  # -> M
    high = [
        self.vocab.word2index([
            w.strip().lower()
            if w.strip().lower() in self.vocab.to_dict()['index2word']
            else '<<pad>>'
            for w in h
        ]) for h in high
    ]  # -> M

    # Put on device.
    high = torch.tensor(high, dtype=torch.long)
    high = high.reshape(len(high), -1).to(self.device)  # -> B x M
    high_len = high.bool().byte().sum(dim=1).view(-1).to(self.device)
    traj = torch.tensor(obs, dtype=torch.float).view(
        len(obs), len(obs[0]), self.img_shape).to(self.device)  # B x M x 147
    traj_len = torch.full((traj.shape[0],), traj.shape[1]).long().to(self.device)

    # Compute CPV reward with new observation incorporated.
    with torch.no_grad():
        self.eval()
        sims = self.compute_similarity(high, traj, high_len, traj_len)

    # Potential-based reward is the delta in similarity between the previous
    # and current trajectory prefixes.
    reward = sims[:, 1:] - sims[:, :-1]
    reward = torch.cat([
        torch.zeros((reward.shape[0], 1), dtype=torch.float).to(self.device),
        reward
    ], dim=1)
    return reward.detach()
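# Illustrative only: the tail of calculate_reward implements potential-based
# shaping, r_t = phi(s_{t+1}) - phi(s_t), with the goal/trajectory similarity
# acting as the potential phi. A toy check of the delta-and-pad logic:
import torch

sims = torch.tensor([[0.1, 0.4, 0.3, 0.9]])        # phi for each trajectory prefix
reward = sims[:, 1:] - sims[:, :-1]                # delta between consecutive prefixes
reward = torch.cat([torch.zeros(1, 1), reward], dim=1)
print(reward)  # tensor([[ 0.0000,  0.3000, -0.1000,  0.6000]])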
def lookup_sentence(self, sent, vocab, max_len=10, eos='pad', pad='pad'):
    if isinstance(sent, list):
        # already tokenized: truncate, append eos, then pad to max_len
        words = sent[:max_len - 1] + [eos]
        length = len(words)
        if len(words) < max_len:
            words += [pad] * (max_len - len(words))
        return vocab.word2index([w.strip() for w in words]), length
    else:
        sent = sent.lower()
        key = sent, max_len
        if key not in self._cache:
            words = revtok.tokenize(sent)[:max_len - 1] + [eos]
            length = len(words)
            if len(words) < max_len:
                words += [pad] * (max_len - len(words))
            self._cache[key] = vocab.word2index([w.strip() for w in words]), length
            # evict random entries when the cache grows past its budget
            while len(self._cache) > self.max_cache:
                keys = list(self._cache.keys())
                del self._cache[random.choice(keys)]
        return self._cache[key]
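# A minimal sketch of the pad/truncate behavior on the pre-tokenized path,
# using a hypothetical vocab stand-in; self is untouched on that path, so the
# sketch passes None for it.
class VocabSketch:
    def __init__(self, words):
        self.w2i = {w: i for i, w in enumerate(words)}
    def word2index(self, words):
        return [self.w2i.get(w, self.w2i['pad']) for w in words]

vocab = VocabSketch(['pad', 'hello', 'world'])
ids, length = lookup_sentence(None, ['hello', 'world'], vocab, max_len=5)
# -> ids [1, 2, 0, 0, 0] (eos and pad both map to 'pad'), length 3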
def tokenized_description(self):
    return revtok.tokenize(self.describe()) if hasattr(self, 'describe') else self.tokenized_name()
def tokenized_name(self):
    return revtok.tokenize(self.name)
def tokenize(self, sentence, field_name=None):
    if not sentence:
        return [], None
    return revtok.tokenize(sentence), None
def tokenize(sent):
    return [w.strip() for w in revtok.tokenize(sent.lower())]
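# revtok attaches the surrounding whitespace to each token so that
# revtok.detokenize can reproduce the original string; the strip() in
# tokenize above discards that spacing (and lower() drops case).
import revtok

raw_tokens = revtok.tokenize('Hello, World!')   # tokens carry their spacing
clean = tokenize('Hello, World!')               # lowercased, spacing stripped
assert revtok.detokenize(raw_tokens) == 'Hello, World!'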
        self.inverse_doc_freqs = idf

    def _term_freqs(self, doc):
        # normalize each distinct token's count by the document length;
        # iterating over the counter keys divides each count exactly once
        counter = Counter(doc)
        for token in counter:
            counter[token] /= len(doc)
        return counter


if __name__ == '__main__':
    # Interactive testing for relevant memories retrieval
    import revtok
    from dataset import Dataset

    dataset = Dataset()
    kv_memory = KeyValueMemory(dataset)

    print('Interactive memory retrieval. {} to cancel\n'.format(
        colorize('Press CTRL + C', color='white')))

    try:
        while True:
            query = revtok.tokenize(input('> ').strip())
            queries, responses, _ = kv_memory.address(query)
            for key, value in zip(queries, responses):
                print('\nQ: {query}'.format(query=revtok.detokenize(key)))
                print('R: {response}'.format(response=revtok.detokenize(value)))
            print()
    except (KeyboardInterrupt, EOFError):
        print('\n\nShutting down')
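# A quick check of _term_freqs on a toy document; self is unused in the method,
# so calling through the class with None works (assumes the module imports
# Counter from collections, as the method requires).
tf = KeyValueMemory._term_freqs(None, ['to', 'be', 'or', 'not', 'to', 'be'])
assert abs(tf['to'] - 2 / 6) < 1e-9  # 'to' appears twice among six tokens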