def test_findall():
    pattern = "[aA]"
    s = ["hello", "and héllo", "this was empty", ""]
    nvstrs = nvstrings.to_device(s)
    got = nvstrs.findall(pattern)[0]
    expected = [None, "a", "a", None]
    assert_eq(got, expected)
def test_ftos():
    s = np.array([0, 103, -254848.5929, 8395794.248339, np.nan, np.inf],
                 dtype=np.float32)
    got = nvstrings.ftos(s)
    expected = nvstrings.to_device(
        ['0.0', '103.0', '-254848.5938', '8395794.0', 'NaN', 'Inf'])
    assert_eq(got, expected)
def test_match(pattern):
    s = ["hello", "and héllo", None, ""]
    pstrs = pd.Series(s)
    nvstrs = nvstrings.to_device(s)
    got = nvstrs.match(pattern)
    expected = pstrs.str.match(pattern).values
    assert_eq(got, expected)
def test_count(pattern):
    s = ["hello", "and héllo", "this was empty", ""]
    pstrs = pd.Series(s)
    nvstrs = nvstrings.to_device(s)
    got = nvstrs.count(pattern)
    expected = pstrs.str.count(pattern).values
    assert_eq(got, expected)
def test_indexes_for_key():
    strs = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    cat = nvcategory.from_strings(strs)
    got = cat.indexes_for_key("ccc")
    expected = [4, 5, 6]
    assert_eq(got, expected)
def test_decode_url():
    s = nvstrings.to_device(urls2)
    got = s.url_decode()
    expected = []
    for url in urls2:
        expected.append(urllib.parse.unquote(url))
    assert_eq(got, expected)
def test_from_offsets_with_bitmask():
    values = np.array([97, 112, 112, 108, 101], dtype=np.int8)
    offsets = np.array([0, 1, 2, 3, 4, 5], dtype=np.int32)
    bitmask = np.array([29], dtype=np.int8)
    s = nvstrings.from_offsets(values, offsets, 5, bitmask, 1)
    expected = ['a', None, 'p', 'l', 'e']
    assert_eq(s, expected)
def eval(self, train_xs, test_xs):
    # print(train_xs.shape)
    # print(test_xs.shape)
    batch_size = 1000
    utils.assert_eq(len(train_xs) % batch_size, 0)
    utils.assert_eq(len(test_xs) % batch_size, 0)
    train_fes = np.zeros(len(train_xs) // batch_size)
    test_fes = np.zeros(len(test_xs) // batch_size)

    x_shape = (batch_size,) + train_xs.shape[1:]
    x_node = Variable(torch.cuda.FloatTensor(*x_shape))
    for i in xrange(len(train_fes)):
        x_node.data.copy_(
            torch.from_numpy(train_xs[i * batch_size:(i + 1) * batch_size]))
        train_fes[i] = self.net_f(x_node).data[0]
    for i in xrange(len(test_fes)):
        # print(test_xs.shape)
        x_node.data.copy_(
            torch.from_numpy(test_xs[i * batch_size:(i + 1) * batch_size]))
        test_fes[i] = self.net_f(x_node).data[0]

    mean_train_fes = train_fes.mean()
    mean_test_fes = test_fes.mean()
    log = 'Eval:\n'
    log += '\tfree_energy on train: %s;\n' % mean_train_fes
    log += '\tfree_energy on test: %s;\n' % mean_test_fes
    log += '\tratio: %s' % np.exp(mean_train_fes - mean_test_fes)
    print(log)
    return log
def test_fillna(repl):
    s = ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""]
    strs = nvstrings.to_device(s)
    pstrs = pd.Series(s)
    got = strs.fillna(repl)
    expected = pstrs.fillna(repl)
    assert_eq(got.to_host(), expected)
def tokenize(self, max_length=14):
    """Tokenizes the questions.

    This will add q_token in each entry of the dataset.
    -1 represents nil, and should be treated as padding_idx in embedding.
    """
    for entry in self.entries:
        tokens = self.dictionary.tokenize(entry['question'], False)
        tokens = tokens[:max_length]
        if len(tokens) < max_length:
            # Note here we pad in front of the sentence
            padding = [self.dictionary.padding_idx] * (max_length - len(tokens))
            tokens = padding + tokens
        utils.assert_eq(len(tokens), max_length)
        entry['q_token'] = tokens

        if entry['caption']:
            tokens = self.dictionary.tokenize(entry['caption'], False)
            tokens = tokens[:50]
            if len(tokens) < 50:
                # Note here we pad in front of the sentence
                padding = [self.dictionary.padding_idx] * (50 - len(tokens))
                tokens = padding + tokens
            utils.assert_eq(len(tokens), 50)
            entry['c_token'] = tokens
        else:
            entry['c_token'] = [0] * 50
def test_get(index):
    index = 0
    s = ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""]
    strs = nvstrings.to_device(s)
    got = strs.get(index)
    expected = ['a', '0', '9', None, 'a', '']
    assert_eq(got.to_host(), expected)
def test_replace(find, replace):
    s = ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""]
    pstrs = pd.Series(s)
    nvstrs = nvstrings.to_device(s)
    got = nvstrs.replace(find, replace, regex=False)
    expected = pstrs.str.replace(find, replace, regex=False)
    assert_eq(got, expected)
def test_slice_replace(start, stop, repl):
    s = ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""]
    strs = nvstrings.to_device(s)
    pstrs = pd.Series(s)
    got = strs.slice_replace(start, stop, repl)
    expected = pstrs.str.slice_replace(start, stop, repl)
    assert_eq(got.to_host(), expected)
def test_slice_from():
    strs = nvstrings.to_device(
        ["hello world", "holy accéntéd", "batman", None, ""])
    d_arr = rmm.to_device(np.asarray([2, 3, -1, -1, -1], dtype=np.int32))
    got = strs.slice_from(starts=d_arr.device_ctypes_pointer.value)
    expected = ['llo world', 'y accéntéd', '', None, '']
    assert_eq(got, expected)
def test_cat():
    strs = nvstrings.to_device(
        ["abc", "def", None, "", "jkl", "mno", "accént"]
    )
    got = strs.cat()
    expected = ['abcdefjklmnoaccént']
    assert_eq(got, expected)

    # non-default separator
    got = strs.cat(sep=':')
    expected = ['abc:def::jkl:mno:accént']
    assert_eq(got, expected)

    # non-default separator and na_rep
    got = strs.cat(sep=':', na_rep='_')
    expected = ['abc:def:_::jkl:mno:accént']
    assert_eq(got, expected)

    # non-null others, default separator, and na_rep
    strs2 = nvstrings.to_device(["1", "2", "3", "4", "5", "é", None])
    got = strs.cat(strs2, sep=":", na_rep="_")
    expected = ['abc:1', 'def:2', '_:3', ':4', 'jkl:5', 'mno:é', 'accént:_']
    assert_eq(got, expected)

    # nvstrings others
    strs2 = nvstrings.to_device(["1", "2", "3", None, "5", "é", ""])
    got = strs.cat(strs2)
    expected = ['abc1', 'def2', None, None, 'jkl5', 'mnoé', 'accént']
    assert_eq(got, expected)
def test_rstrip():
    s = [" hello ", " there ", " world ", None, " accénté ", ""]
    strs = nvstrings.to_device(s)
    pstrs = pd.Series(s)
    got = strs.rstrip()
    expected = pstrs.str.rstrip()
    assert_eq(got.to_host(), expected)
def test_encode_url():
    s = nvstrings.to_device(urls1)
    got = s.url_encode()
    expected = []
    for url in urls1:
        expected.append(urllib.parse.quote(url, safe="~"))
    assert_eq(got, expected)
def test_values():
    strs = nvstrings.to_device(
        ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"])
    cat = nvcategory.from_strings(strs)
    got = cat.values()
    expected = [3, 0, 3, 2, 1, 1, 1, 3, 0]
    assert_eq(got, expected)
def test_extract():
    pattern = r"Flight:([A-Z]+)(\d+)"
    s = [
        "ALA-PEK Flight:HU7934",
        "HKT-PEK Flight:CA822",
        "FRA-PEK Flight:LA8769",
        "FRA-PEK Flight:LH7332",
        "",
        None,
        "Flight:ZZ",
    ]
    nvstrs = nvstrings.to_device(s)
    got = nvstrs.extract(pattern)
    expected = np.array(
        [
            ["HU", "7934"],
            ["CA", "822"],
            ["LA", "8769"],
            ["LH", "7332"],
            [None, None],
            [None, None],
            [None, None],
        ]
    )
    assert_eq(got[0], expected[:, 0])
    assert_eq(got[1], expected[:, 1])
def test_remove_unused_keys():
    strs1 = nvstrings.to_device(["a", "b", "b", "f", "c", "f"])
    strs2 = nvstrings.to_device(["b", "c", "e", "d"])
    cat = nvcategory.from_strings(strs1)
    cat1 = cat.set_keys(strs2)
    cat1_unused_removed = cat1.remove_unused_keys()
    assert_eq(cat1_unused_removed.keys(), ["b", "c"])
def test_dtos():
    s = np.array([0, 103342.313, -25.4294, 839542223232.794248339],
                 dtype=np.float64)
    got = nvstrings.dtos(s)
    expected = nvstrings.to_device(
        ['0', '103342.313', '-25.4294', '8.395422232e+11'])
    assert_eq(got, expected)
def tokenize(self, max_qu_length=14, max_cap_length=18):
    """Tokenizes the questions and captions.

    This will add q_token and c_token in each entry of the dataset.
    -1 represents nil, and should be treated as padding_idx in embedding.
    """
    for entry in self.entries:
        qtokens = self.question_dictionary.tokenize(entry['question'], False)
        qtokens = qtokens[:max_qu_length]

        ctokens = [self.caption_dictionary.word2idx['<start>']]
        ctokens.extend(
            self.caption_dictionary.tokenize(entry['caption'], False))
        ctokens.append(self.caption_dictionary.word2idx['<end>'])
        ctokens = ctokens[:max_cap_length]

        if len(qtokens) < max_qu_length:
            # Note here we pad in front of the sentence
            qpadding = [self.question_dictionary.padding_idx] \
                * (max_qu_length - len(qtokens))
            qtokens = qpadding + qtokens
        utils.assert_eq(len(qtokens), max_qu_length)

        entry['c_len'] = len(ctokens)
        if len(ctokens) < max_cap_length:
            # Captions are padded at the end of the sentence
            cpadding = [self.caption_dictionary.padding_idx] \
                * (max_cap_length - len(ctokens))
            ctokens = ctokens + cpadding
        utils.assert_eq(len(ctokens), max_cap_length)

        entry['q_token'] = qtokens
        entry['c_token'] = ctokens
def test_extract_record():
    pattern = r"Flight:([A-Z]+)(\d+)"
    s = [
        "ALA-PEK Flight:HU7934",
        "HKT-PEK Flight:CA822",
        "FRA-PEK Flight:LA8769",
        "FRA-PEK Flight:LH7332",
        "",
        None,
        "Flight:ZZ",
    ]
    nvstrs = nvstrings.to_device(s)
    got = nvstrs.extract_record(pattern)
    expected = np.array(
        [
            ["HU", "7934"],
            ["CA", "822"],
            ["LA", "8769"],
            ["LH", "7332"],
            [None, None],
            [None, None],
            [None, None],
        ]
    )
    for i in range(len(got)):
        assert_eq(got[i], expected[i, :])
def _load_dataset(dataroot, name, img_id2val, label2ans, args):
    """Load entries.

    img_id2val: dict {img_id -> val} val can be used to
        retrieve image or features_path
    dataroot: root path of dataset
    name: 'train', 'val', 'test-dev2015', 'test2015'
    """
    question_path = os.path.join(dataroot,
                                 'questions/%s_questions.json' % (name))
    print(f"Question Path : {question_path}")
    if name == 'trainval':
        combine_trainval(dataroot)
    questions = json.load(open(question_path))
    if 'questions' in questions:
        questions = questions['questions']
    questions = sorted(questions, key=lambda x: x['question_id'])
    answer_not_found = 0
    print(name)
    # if 'test2015' not in name or 'test_dev' not in name:
    if 'test_dev' not in name:
        qn_id_to_ans = {}
        answer_path = os.path.join(dataroot, 'features',
                                   '%s_target.json' % name)
        print(f"Answer Path : {answer_path}")
        answers = json.load(open(answer_path, 'r'))
        for answer in answers:
            qn_id_to_ans[str(answer['question_id'])] = answer

        entries = []
        for question in questions:
            if str(question['question_id']) in qn_id_to_ans:
                answer = qn_id_to_ans[str(question['question_id'])].copy()
            else:
                answer_not_found += 1
                answer = {
                    'question_id': question['question_id'],
                    'image_id': question['image_id'],
                    'scores': [],
                    'labels': []
                }
            try:
                utils.assert_eq(question['question_id'], answer['question_id'])
                utils.assert_eq(question['image_id'], answer['image_id'])
            except AssertionError as e:
                print(e)
            img_id = question['image_id']
            entries.append(
                _create_entry(img_id2val[str(img_id)], question, answer))
    else:  # test2015
        entries = []
        for question in questions:
            img_id = question['image_id']
            entries.append(
                _create_entry(img_id2val[str(img_id)], question, None))
    print("answers not found {}".format(answer_not_found))
    return entries
def test_match_strings():
    s1 = ["hello", "here", None, "accéntéd", None, ""]
    s2 = ["hello", "there", "world", "accéntéd", None, ""]
    strs1 = nvstrings.to_device(s1)
    strs2 = nvstrings.to_device(s2)
    got = strs1.match_strings(strs2)
    expected = [True, False, False, True, True, True]
    assert_eq(got, expected)
def test_values():
    narr = np.array([4, 1, 2, 3, 2, 1, 4, 1, 1])
    cat = nvcategory.from_numbers(narr)
    values = np.empty([cat.size()], dtype=np.int32)
    cat.values(values)
    got = values.tolist()
    expected = [3, 0, 1, 2, 1, 0, 3, 0, 0]
    assert_eq(got, expected)
def test_to_numbers():
    narr = np.array([2, 1, 1.25, 1.5, 1, 1.25, 1, 1, 2])
    cat = nvcategory.from_numbers(narr)
    nbrs = np.empty([cat.size()], dtype=narr.dtype)
    cat.to_numbers(nbrs)
    got = nbrs.tolist()
    expected = narr.tolist()
    assert_eq(got, expected)
def test_order_length_alphabetical():
    strs = nvstrings.to_device([
        "abc", "defghi", None, "jkl", "mno", "pqr", "stu", "dog and cat",
        "accénted", ""
    ])
    sorted_strs = strs.order(3)
    expected = [2, 9, 0, 3, 4, 5, 6, 1, 8, 7]
    assert_eq(sorted_strs, expected)
def test_keys():
    narr = np.array([2, 1, 1.25, 1.5, 1, 1.25, 1, 1, 2])
    cat = nvcategory.from_numbers(narr)
    keys = np.empty([cat.keys_size()], dtype=narr.dtype)
    cat.keys(keys)
    got = keys.tolist()
    expected = [1.0, 1.25, 1.5, 2.0]
    assert_eq(got, expected)
def test_from_offsets():
    values = np.array([97, 112, 112, 108, 101], dtype=np.int8)
    offsets = np.array([0, 1, 2, 3, 4, 5], dtype=np.int32)
    cat = nvcategory.from_offsets(values, offsets, 5)
    expected_keys = ['a', 'e', 'l', 'p']
    expected_values = [0, 3, 3, 2, 1]
    assert_eq(cat.keys(), expected_keys)
    assert_eq(cat.values(), expected_values)
def __init__(self, state, action, reward, next_state, end):
    utils.assert_eq(type(state), type(next_state))
    self._state = (state * 255.0).astype(np.uint8)
    self._next_state = (next_state * 255.0).astype(np.uint8)
    self.action = action
    self.reward = reward
    self.end = end
def loss(self, states, actions, targets):
    """
    params:
        states: Variable [batch, channel, w, h]
        actions: Variable [batch, num_actions] one hot encoding
        targets: Variable [batch]
    """
    utils.assert_eq(actions.size(1), self.num_actions)
    qs = self.online_q_net(states)
    preds = (qs * actions).sum(1)
    err = nn.functional.smooth_l1_loss(preds, targets)
    return err
def loss(self, states, actions, targets):
    """
    params:
        states: Variable [batch, channel, w, h]
        actions: Variable [batch, num_actions] one hot encoding
        targets: Variable [batch, num_atoms]
    """
    utils.assert_eq(actions.size(1), self.num_actions)
    actions = actions.unsqueeze(2)
    probs = self.online_q_net(states)  # [batch, num_actions, num_atoms]
    probs = (probs * actions).sum(1)  # [batch, num_atoms]
    xent = -(targets * torch.log(probs.clamp(min=utils.EPS))).sum(1)
    xent = xent.mean(0)
    return xent
def tokenize(self, max_length=14):
    """Tokenizes the questions.

    This will add q_token in each entry of the dataset.
    -1 represents nil, and should be treated as padding_idx in embedding.
    """
    for entry in self.entries:
        tokens = self.dictionary.tokenize(entry['question'], False)
        tokens = tokens[:max_length]
        if len(tokens) < max_length:
            # Note here we pad in front of the sentence
            padding = [self.dictionary.padding_idx] * (max_length - len(tokens))
            tokens = padding + tokens
        utils.assert_eq(len(tokens), max_length)
        entry['q_token'] = tokens
def _load_dataset(dataroot, name, img_id2val):
    """Load entries.

    img_id2val: dict {img_id -> val} val can be used to
        retrieve image or features
    dataroot: root path of dataset
    name: 'train', 'val'
    """
    question_path = os.path.join(
        dataroot, 'v2_OpenEnded_mscoco_%s2014_questions.json' % name)
    questions = sorted(json.load(open(question_path))['questions'],
                       key=lambda x: x['question_id'])
    answer_path = os.path.join(dataroot, 'cache', '%s_target.pkl' % name)
    answers = cPickle.load(open(answer_path, 'rb'))
    answers = sorted(answers, key=lambda x: x['question_id'])

    utils.assert_eq(len(questions), len(answers))
    entries = []
    for question, answer in zip(questions, answers):
        utils.assert_eq(question['question_id'], answer['question_id'])
        utils.assert_eq(question['image_id'], answer['image_id'])
        img_id = question['image_id']
        entries.append(_create_entry(img_id2val[img_id], question, answer))

    return entries