def get_seen_edges(self, seen_nodes, aug_scanned_edges, tc=None):
    """Keep only the scanned edges whose destination (eg_idx, vj) was seen.

    seen_nodes: (np.array) n_seen_nodes x 2, (eg_idx, vj), unique but unsorted
    aug_scanned_edges: (np.array) n_aug_scanned_edges x 8,
        (eg_idx, vi, vj, rel, idx_vi, idx_vj, new_idx_e2vi, new_idx_e2vj),
        sorted by (eg_idx, vi, vj)

    Returns:
        seen_edges: n_seen_edges x 6, (eg_idx, vi, vj, rel, idx_vi, idx_vj),
            sorted by (eg_idx, vi, vj)
        seen_idx_for_edges_y: n_seen_edges x 1, row positions of the kept
            edges inside aug_scanned_edges
    """
    if tc is not None:
        start = time.time()

    # Membership test on (eg_idx, vj) pairs through a structured view so
    # each row compares as a single composite element.
    dst_pairs = aug_scanned_edges[:, [0, 2]].copy()
    keep = np.in1d(dst_pairs.view('<i4,<i4'), seen_nodes.view('<i4,<i4'))

    # Row positions of the surviving edges within aug_scanned_edges.
    seen_idx_for_edges_y = np.nonzero(keep)[0].astype('int32').reshape(-1, 1)

    # (eg_idx, vi, vj, rel) of the surviving edges; boolean masking keeps order.
    kept = aug_scanned_edges[keep][:, :4]

    # Segment ids over (eg_idx, vi) and group ids over (eg_idx, vj).
    src_seg = np.asarray(get_segment_ids(kept[:, [0, 1]]), dtype='int32').reshape(-1, 1)
    _, dst_grp = np.unique(kept[:, [0, 2]], axis=0, return_inverse=True)
    dst_grp = np.asarray(dst_grp, dtype='int32').reshape(-1, 1)

    seen_edges = np.concatenate((kept, src_seg, dst_grp), axis=1)

    if tc is not None:
        tc['seen_e'] += time.time() - start
    return seen_edges, seen_idx_for_edges_y
def preprocess_data(self, data, shuffle, bucketing):
    """Append EOS, batch the corpus, and tensorize each batch.

    Each output entry is [padded_texts, lens, keywords, keyword_pos] plus,
    depending on self.opts, segment ids and/or remaining-syllable tensors.
    """
    self._append_eos(data)
    opts = self.opts
    processed = []
    for batch in self._get_batches(data, shuffle=shuffle, bucketing=bucketing):
        texts, keyword_pos = zip(*batch)
        lens = torch.as_tensor([len(seq) for seq in texts], dtype=torch.long)
        if keyword_pos[0] is None:
            # This batch carries no keyword supervision.
            keywords = None
            keyword_pos = None
        else:
            keywords = torch.as_tensor([seq[pos] for seq, pos in batch],
                                       dtype=torch.long)
            keyword_pos = torch.as_tensor(keyword_pos, dtype=torch.long)
        texts = self._pad_batch_sequence(texts)
        entry = [texts, lens, keywords, keyword_pos]
        if opts.need_segment_ids:
            entry.append(
                get_segment_ids(texts, self.SEP_token, 0,
                                relative=opts.segment_emb_relative,
                                sep_as_new_segment=opts.sep_as_new_segment))
        if opts.need_remain_syllables:
            entry.append(
                get_remain_syllables(self.word2syllable, self.SEP_token,
                                     decoder_target=texts))
        processed.append(entry)
    return processed
def get_union_edges(self, scanned_edges, selfloop_edges, tc=None):
    """Merge scanned edges with self-loop edges, re-sorted by (eg_idx, vi, vj).

    scanned_edges: (np.array) n_scanned_edges x 6,
        (eg_idx, vi, vj, rel, idx_vi, idx_vj), sorted by (eg_idx, vi, vj)
    selfloop_edges: (np.array) n_selfloop_edges x 4, (eg_idx, vi, vi, selfloop)

    Returns aug_scanned_edges: n_aug_scanned_edges x 6,
        (eg_idx, vi, vj, rel, idx_vi, idx_vj), sorted by (eg_idx, vi, vj)
    """
    if tc is not None:
        start = time.time()

    # Drop the idx columns; guard the empty case so slicing never fails.
    if len(scanned_edges) == 0:
        base = np.zeros((0, 4), dtype='int32')
    else:
        base = scanned_edges[:, :4]  # (eg_idx, vi, vj, rel)
    merged = np.concatenate([base, selfloop_edges], axis=0).copy()

    # Lexicographic row sort on the first three columns via a structured view.
    order = np.argsort(merged.view('<i4,<i4,<i4,<i4'),
                       order=['f0', 'f1', 'f2'], axis=0)
    order = np.squeeze(order, 1).astype('int32')
    aug_scanned_edges = merged[order]  # sorted by (eg_idx, vi, vj)

    seg_vi = np.asarray(get_segment_ids(aug_scanned_edges[:, [0, 1]]),
                        dtype='int32').reshape(-1, 1)
    _, grp_vj = np.unique(aug_scanned_edges[:, [0, 2]], axis=0,
                          return_inverse=True)
    grp_vj = np.asarray(grp_vj, dtype='int32').reshape(-1, 1)
    aug_scanned_edges = np.concatenate([aug_scanned_edges, seg_vi, grp_vj],
                                       axis=1)

    if tc is not None:
        tc['union_e'] += time.time() - start
    return aug_scanned_edges
def get_selected_edges(self, sampled_edges, tc=None):
    """Project sampled edges to (eg_idx, vi, vj, rel) plus segment/group ids.

    sampled_edges: (np.array) n_sampled_edges x 6,
        (eg_idx, edge_id, vi, vj, rel, ca_idx), sorted by (eg_idx, edge_id)

    Returns selected_edges: n_sampled_edges x 6,
        (eg_idx, vi, vj, rel, idx_vi, idx_vj), sorted by (eg_idx, vi, vj)
    """
    if tc is not None:
        start = time.time()
    if len(sampled_edges) == 0:
        # Nothing sampled: return an empty, correctly-shaped result.
        return np.zeros((0, 6), dtype='int32')

    seg_vi = np.asarray(get_segment_ids(sampled_edges[:, [0, 2]]),
                        dtype='int32').reshape(-1, 1)
    _, grp_vj = np.unique(sampled_edges[:, [0, 3]], axis=0,
                          return_inverse=True)
    grp_vj = np.asarray(grp_vj, dtype='int32').reshape(-1, 1)
    selected_edges = np.concatenate(
        [sampled_edges[:, [0, 2, 3, 4]], seg_vi, grp_vj], axis=1)

    if tc is not None:
        tc['sele_e'] += time.time() - start
    return selected_edges
def _update_cache(self, input_step):
    """Extend the cached segment-id tensor by one decoding step.

    On the first call (empty cache) ids are computed for the whole input;
    afterwards only a single new step is accepted and its id row is derived
    from the previously cached ids.
    """
    if self.cache is None:
        # Cold start: compute segment ids for everything seen so far.
        ids = get_segment_ids(input_step, self.SEP_token, 0,
                              self.relative, self.sep_as_new_segment)
    else:
        assert input_step.size(0) == 1, "only support step by step update"
        prev_ids, prev_step = self.cache
        next_row = prev_ids[-1].clone()
        # Whether a SEP bumps the id at the current or the previous token
        # depends on the sep_as_new_segment convention.
        trigger = input_step[-1] if self.sep_as_new_segment else prev_step
        next_row[trigger == self.SEP_token] += 1
        if self.relative:
            next_row %= 2
        ids = torch.cat([prev_ids, next_row.unsqueeze(0)], dim=0)
    self.clear_cache()
    self.cache = (ids, input_step[-1])
    return ids
def get_segment_ids(self, input, align_pos=0, use_cache=False, restrict=True):
    """Return segment ids for `input`, optionally via the incremental cache.

    If any id reaches self.num_segments, either raise (restrict=True) or
    clamp it to the last valid embedding index.
    """
    if use_cache:
        # Incremental path: once a cache exists, feed only the newest step.
        step = input if self.cache is None else input[-1:]
        segment_ids = self._update_cache(step)
        assert segment_ids.shape == input.shape
    else:
        segment_ids = get_segment_ids(input, self.SEP_token, align_pos,
                                      self.relative, self.sep_as_new_segment)
    overflow = segment_ids >= self.num_segments
    if overflow.any():
        if restrict:
            raise RuntimeError("segment id is greater than the maximum embedding index")
        segment_ids = segment_ids.masked_fill(overflow, self.num_segments - 1)
    return segment_ids
def get_union_edges(self, scanned_edges, selfloope_edges, backtrace_edges, tc=None):
    """Merge scanned, self-loop and optional backtrace edges, re-sorted by (eg_idx, vi, vj).

    scanned_edges: (np.array) n_scanned_edges x 6,
        (eg_idx, vi, vj, rel, idx_vi, idx_vj), sorted by (eg_idx, vi, vj)
    selfloope_edges: (np.array) n_selfloop_edges x 4, (eg_idx, vi, vi, selfloop)
        (the parameter keeps its historical spelling for keyword callers)
    backtrace_edges: (np.array or None) n_backtrace_edges x 4,
        (eg_idx, vj, vi, backtrace)

    Returns:
        aug_scanned_edges: n_aug x 6, (eg_idx, vi, vj, rel, idx_vi, idx_vj),
            sorted by (eg_idx, vi, vj)
        new_idx_for_edges_y: n_scanned_edges x 1, post-sort row positions of
            the original scanned edges
        rest_idx: (n_aug - n_scanned_edges) x 1, post-sort row positions of
            the self-loop / backtrace edges
    """
    if tc is not None:
        t0 = time.time()

    # Guard the empty case, consistent with the sibling get_union_edges
    # that takes no backtrace edges; plain slicing would fail on an empty
    # list or a column-less array.
    scanned_edges = (np.zeros((0, 4), dtype='int32')
                     if len(scanned_edges) == 0 else scanned_edges[:, :4])
    n_scanned_edges = scanned_edges.shape[0]

    if backtrace_edges is None:
        all_edges = np.concatenate([scanned_edges, selfloope_edges], axis=0)
    else:
        all_edges = np.concatenate(
            [scanned_edges, selfloope_edges, backtrace_edges], axis=0)

    # Lexicographic row sort on (eg_idx, vi, vj) through a structured view.
    sorted_idx = np.squeeze(
        np.argsort(all_edges.view('<i4,<i4,<i4,<i4'),
                   order=['f0', 'f1', 'f2'], axis=0), 1).astype('int32')
    # Inverse permutation: where each pre-sort row landed after sorting.
    new_idx = np.argsort(sorted_idx).astype('int32')
    new_idx_for_edges_y = np.expand_dims(new_idx[:n_scanned_edges], 1)
    rest_idx = np.expand_dims(new_idx[n_scanned_edges:], 1)

    aug_scanned_edges = all_edges[sorted_idx]  # sorted by (eg_idx, vi, vj)
    idx_vi = get_segment_ids(aug_scanned_edges[:, [0, 1]])
    _, idx_vj = np.unique(aug_scanned_edges[:, [0, 2]], axis=0,
                          return_inverse=True)
    idx_vi = np.expand_dims(np.array(idx_vi, dtype='int32'), 1)
    idx_vj = np.expand_dims(np.array(idx_vj, dtype='int32'), 1)
    aug_scanned_edges = np.concatenate([aug_scanned_edges, idx_vi, idx_vj],
                                       axis=1)

    if tc is not None:
        tc['union_e'] += time.time() - t0
    return aug_scanned_edges, new_idx_for_edges_y, rest_idx
def preprocess_data(self, data, shuffle, bucketing):
    """Append EOS, batch the corpus, and tensorize each batch for the
    bidirectional (forward-target / backward-source) setting.

    Each output entry is [[fwd_tgt, bwd_src], lens, keywords, keyword_pos]
    plus, depending on self.opts, paired segment ids and/or paired
    remaining-syllable tensors.
    """
    self._append_eos(data)
    opts = self.opts
    processed = []
    for batch in self._get_batches(data, shuffle=shuffle, bucketing=bucketing):
        texts, keyword_pos = zip(*batch)
        lens = torch.as_tensor([len(seq) for seq in texts], dtype=torch.long)
        if keyword_pos[0] is None:
            # This batch carries no keyword supervision.
            keywords = None
            keyword_pos = None
        else:
            keywords = torch.as_tensor([seq[pos] for seq, pos in batch],
                                       dtype=torch.long)
            keyword_pos = torch.as_tensor(keyword_pos, dtype=torch.long)
        fwd_tgt = self._pad_batch_sequence(texts)
        # Backward source = forward target reversed along dim 0
        # (assumes a time-major padded layout — TODO confirm).
        bwd_src = fwd_tgt.flip(0)
        entry = [[fwd_tgt, bwd_src], lens, keywords, keyword_pos]
        if opts.need_segment_ids:
            fwd_seg_ids = get_segment_ids(
                fwd_tgt, self.SEP_token, 0,
                relative=opts.segment_emb_relative,
                sep_as_new_segment=opts.sep_as_new_segment)
            entry.append([fwd_seg_ids, fwd_seg_ids.flip(0)])
        if opts.fwd_need_remain_syllables or opts.bwd_need_remain_syllables:
            fwd_rem = (get_remain_syllables(self.word2syllable, self.SEP_token,
                                            decoder_target=fwd_tgt)
                       if opts.fwd_need_remain_syllables else None)
            bwd_rem = (get_remain_syllables(self.word2syllable, self.SEP_token,
                                            decoder_input=bwd_src)
                       if opts.bwd_need_remain_syllables else None)
            entry.append([fwd_rem, bwd_rem])
        processed.append(entry)
    return processed