Example #1
File: dataenv.py    Project: netpaladinx/UCAN
    def get_seen_edges(self, seen_nodes, aug_scanned_edges, tc=None):
        """ seen_nodes: (np.array) n_seen_nodes x 2, (eg_idx, vj) unique but not sorted
            aug_scanned_edges: (np.array) n_aug_scanned_edges x 8,
                (eg_idx, vi, vj, rel, idx_vi, idx_vj, new_idx_e2vi, new_idx_e2vj) sorted by (eg_idx, vi, vj)
        """
        if tc is not None:
            t0 = time.time()

        # n_aug_scanned_edges x 2, (eg_idx, vj) not unique and not sorted
        aug_scanned_vj = aug_scanned_edges[:, [0, 2]].copy()
        mask_vj = np.in1d(aug_scanned_vj.view('<i4,<i4'),
                          seen_nodes.view('<i4,<i4'))
        # n_seen_edges x 4, (eg_idx, vi, vj, rel) sorted by (eg_idx, vi, vj)
        seen_edges = aug_scanned_edges[mask_vj][:, :4]

        # n_seen_edges x 1
        seen_idx_for_edges_y = np.expand_dims(
            np.arange(mask_vj.shape[0])[mask_vj], 1).astype('int32')

        idx_vi = get_segment_ids(seen_edges[:, [0, 1]])
        _, idx_vj = np.unique(seen_edges[:, [0, 2]],
                              axis=0,
                              return_inverse=True)
        idx_vi = np.expand_dims(np.array(idx_vi, dtype='int32'), 1)
        idx_vj = np.expand_dims(np.array(idx_vj, dtype='int32'), 1)
        seen_edges = np.concatenate((seen_edges, idx_vi, idx_vj), axis=1)

        if tc is not None:
            tc['seen_e'] += time.time() - t0
        # seen_edges: n_seen_edges x 6, (eg_idx, vi, vj, rel, idx_vi, idx_vj) sorted by (eg_idx, vi, vj)
        # seen_idx_for_edges_y: n_seen_edges x 1
        return seen_edges, seen_idx_for_edges_y
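The NumPy-based examples on this page (#1, #3, #4, #7) all call a get_segment_ids helper whose definition is not shown here; they pass it a row-wise sorted 2-column array such as seen_edges[:, [0, 1]]. A minimal sketch of what such a helper could look like is given below. The name get_segment_ids_np and the exact semantics are assumptions, not the project's actual implementation: each row of a sorted array is mapped to a contiguous segment index so that identical consecutive rows share an id.

import numpy as np

def get_segment_ids_np(sorted_rows):
    """Hypothetical sketch: assign a contiguous segment id to each row of a
    row-wise sorted 2-D array, so identical consecutive rows share an id."""
    sorted_rows = np.asarray(sorted_rows)
    if len(sorted_rows) == 0:
        return np.zeros(0, dtype='int32')
    # 1 where a row differs from the previous row, 0 where it repeats
    changed = np.any(sorted_rows[1:] != sorted_rows[:-1], axis=1)
    return np.concatenate(([0], np.cumsum(changed))).astype('int32')

On seen_edges[:, [0, 1]] above, this would yield the per-(eg_idx, vi) group index that idx_vi appears to encode.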
Example #2
 def preprocess_data(self, data, shuffle, bucketing):
     self._append_eos(data)
     data = self._get_batches(data, shuffle=shuffle, bucketing=bucketing)
     res = []
     for batch in data:
         texts, keyword_pos = zip(*batch)
         lens = torch.as_tensor([len(s) for s in texts], dtype=torch.long)
         if keyword_pos[0] is None:
             keywords = keyword_pos = None
         else:
             keywords = torch.as_tensor([t[p] for t, p in batch],
                                        dtype=torch.long)
             keyword_pos = torch.as_tensor(keyword_pos, dtype=torch.long)
         texts = self._pad_batch_sequence(texts)
         res.append([texts, lens, keywords, keyword_pos])
         if self.opts.need_segment_ids:
             segment_ids = get_segment_ids(
                 texts,
                 self.SEP_token,
                 0,
                 relative=self.opts.segment_emb_relative,
                 sep_as_new_segment=self.opts.sep_as_new_segment)
             res[-1].append(segment_ids)
         if self.opts.need_remain_syllables:
             remain_syllables = get_remain_syllables(self.word2syllable,
                                                     self.SEP_token,
                                                     decoder_target=texts)
             res[-1].append(remain_syllables)
     return res
Example #3
    def get_union_edges(self, scanned_edges, selfloop_edges, tc=None):
        """ scanned_edges: (np.array) n_scanned_edges x 6, (eg_idx, vi, vj, rel, idx_vi, idx_vj) sorted by (eg_idx, vi, vj)
            selfloop_edges: (np.array) n_selfloop_edges x 4 (eg_idx, vi, vi, selfloop)
        """
        if tc is not None:
            t0 = time.time()

        # (eg_idx, vi, vj, rel)
        scanned_edges = (np.zeros((0, 4), dtype='int32')
                         if len(scanned_edges) == 0 else scanned_edges[:, :4])
        all_edges = np.concatenate([scanned_edges, selfloop_edges],
                                   axis=0).copy()
        sorted_idx = np.squeeze(
            np.argsort(all_edges.view('<i4,<i4,<i4,<i4'),
                       order=['f0', 'f1', 'f2'],
                       axis=0), 1).astype('int32')
        aug_scanned_edges = all_edges[sorted_idx]  # sorted by (eg_idx, vi, vj)
        idx_vi = get_segment_ids(aug_scanned_edges[:, [0, 1]])
        _, idx_vj = np.unique(aug_scanned_edges[:, [0, 2]],
                              axis=0,
                              return_inverse=True)
        idx_vi = np.expand_dims(np.array(idx_vi, dtype='int32'), 1)
        idx_vj = np.expand_dims(np.array(idx_vj, dtype='int32'), 1)
        aug_scanned_edges = np.concatenate([aug_scanned_edges, idx_vi, idx_vj],
                                           axis=1)

        if tc is not None:
            tc['union_e'] += time.time() - t0
        # aug_scanned_edges: n_aug_scanned_edges x 6, (eg_idx, vi, vj, rel, idx_vi, idx_vj) sorted by (eg_idx, vi, vj)
        return aug_scanned_edges
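To make the idx_vi / idx_vj construction in Examples #1 and #3 concrete, here is a tiny hand-worked input using the hypothetical get_segment_ids_np sketch from Example #1; the values are made up for illustration only.

import numpy as np

# (eg_idx, vi, vj, rel), already sorted by (eg_idx, vi, vj)
edges = np.array([[0, 1, 2, 5],
                  [0, 1, 3, 7],
                  [0, 2, 3, 7],
                  [1, 1, 2, 5]], dtype='int32')

idx_vi = get_segment_ids_np(edges[:, [0, 1]])
# -> [0, 0, 1, 2]: consecutive rows with the same (eg_idx, vi) share an id

_, idx_vj = np.unique(edges[:, [0, 2]], axis=0, return_inverse=True)
# -> [0, 1, 1, 2]: index of each row's (eg_idx, vj) among the sorted unique pairs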
Example #4
    def get_selected_edges(self, sampled_edges, tc=None):
        """ sampled_edges: (np.array) n_sampled_edges x 6, (eg_idx, edge_id, vi, vj, rel, ca_idx) sorted by (eg_idx, edge_id)
        """
        if tc is not None:
            t0 = time.time()

        if len(sampled_edges) == 0:
            return np.zeros((0, 6), dtype='int32')

        idx_vi = get_segment_ids(sampled_edges[:, [0, 2]])
        _, idx_vj = np.unique(sampled_edges[:, [0, 3]],
                              axis=0,
                              return_inverse=True)

        idx_vi = np.expand_dims(np.array(idx_vi, dtype='int32'), 1)
        idx_vj = np.expand_dims(np.array(idx_vj, dtype='int32'), 1)

        selected_edges = np.concatenate(
            [sampled_edges[:, [0, 2, 3, 4]], idx_vi, idx_vj], axis=1)

        if tc is not None:
            tc['sele_e'] += time.time() - t0
        # selected_edges: (np.array) n_selected_edges (=n_sampled_edges) x 6, (eg_idx, vi, vj, rel, idx_vi, idx_vj)
        #   sorted by (eg_idx, vi, vj)
        return selected_edges
Example #5
 def _update_cache(self, input_step):
     if self.cache is None:
         ids = get_segment_ids(input_step, self.SEP_token, 0, self.relative, self.sep_as_new_segment)
     else:
         assert input_step.size(0) == 1, "only support step by step update"
         old, last_step = self.cache
         new_ids = old[-1].clone()
         step = input_step[-1] if self.sep_as_new_segment else last_step
         new_ids[step == self.SEP_token] += 1
         if self.relative:
             new_ids %= 2
         ids = torch.cat([old, new_ids.unsqueeze(0)], dim=0)
     self.clear_cache()
     self.cache = (ids, input_step[-1])
     return ids
Example #6
 def get_segment_ids(self, input, align_pos=0, use_cache=False, restrict=True):
     if not use_cache:
         segment_ids = get_segment_ids(input, self.SEP_token, align_pos, self.relative, self.sep_as_new_segment)
     else:
         if self.cache is None:
             segment_ids = self._update_cache(input)
         else:
             segment_ids = self._update_cache(input[-1:])
     assert segment_ids.shape == input.shape
     mask = segment_ids >= self.num_segments
     if mask.any():
         if restrict:
             raise RuntimeError("segment id is greater than the maximum embedding index")
         segment_ids = segment_ids.masked_fill(mask, self.num_segments - 1)
     return segment_ids
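Examples #5 and #6 call a token-level get_segment_ids(input, SEP_token, align_pos, relative, sep_as_new_segment) that is likewise not defined on this page. Judging from the cache update in Example #5, the id grows by one at every SEP token, sep_as_new_segment decides whether the SEP position itself already belongs to the new segment, and relative folds the ids to {0, 1}. The following is a minimal sketch under those assumptions; the function name is made up and the align_pos argument seen at the call sites is not modeled.

import torch

def segment_ids_from_sep(tokens, sep_token, relative=False,
                         sep_as_new_segment=False):
    """Hypothetical sketch of a token-level segment-id helper.
    tokens: LongTensor of shape (seq_len, batch); ids start at 0 and
    increase by one at every SEP token."""
    is_sep = (tokens == sep_token).long()
    ids = torch.cumsum(is_sep, dim=0)
    if not sep_as_new_segment:
        # the SEP token itself keeps the id of the segment it closes;
        # only the following token starts the new segment
        ids = ids - is_sep
    if relative:
        # fold absolute ids into alternating {0, 1} ids, as in Example #5
        ids = ids % 2
    return ids

A step-by-step decoder would keep a cache as in Example #5 rather than recomputing the cumulative sum at every step.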
Example #7
File: dataenv.py    Project: netpaladinx/UCAN
    def get_union_edges(self,
                        scanned_edges,
                        selfloop_edges,
                        backtrace_edges,
                        tc=None):
        """ scanned_edges: (np.array) n_scanned_edges x 6, (eg_idx, vi, vj, rel, idx_vi, idx_vj) sorted by (eg_idx, vi, vj)
            selfloop_edges: (np.array) n_selfloop_edges x 4 (eg_idx, vi, vi, selfloop)
            backtrace_edges: (np.array) n_backtrace_edges x 4 (eg_idx, vj, vi, backtrace)
        """
        if tc is not None:
            t0 = time.time()

        scanned_edges = scanned_edges[:, :4]  # (eg_idx, vi, vj, rel)
        n_scanned_edges = scanned_edges.shape[0]
        if backtrace_edges is None:
            all_edges = np.concatenate([scanned_edges, selfloop_edges],
                                       axis=0)
        else:
            all_edges = np.concatenate(
                [scanned_edges, selfloop_edges, backtrace_edges], axis=0)
        sorted_idx = np.squeeze(
            np.argsort(all_edges.view('<i4,<i4,<i4,<i4'),
                       order=['f0', 'f1', 'f2'],
                       axis=0), 1).astype('int32')

        new_idx = np.argsort(sorted_idx).astype('int32')
        new_idx_for_edges_y = np.expand_dims(new_idx[:n_scanned_edges], 1)
        rest_idx = np.expand_dims(new_idx[n_scanned_edges:], 1)

        aug_scanned_edges = all_edges[sorted_idx]  # sorted by (eg_idx, vi, vj)
        idx_vi = get_segment_ids(aug_scanned_edges[:, [0, 1]])
        _, idx_vj = np.unique(aug_scanned_edges[:, [0, 2]],
                              axis=0,
                              return_inverse=True)
        idx_vi = np.expand_dims(np.array(idx_vi, dtype='int32'), 1)
        idx_vj = np.expand_dims(np.array(idx_vj, dtype='int32'), 1)
        aug_scanned_edges = np.concatenate([aug_scanned_edges, idx_vi, idx_vj],
                                           axis=1)

        if tc is not None:
            tc['union_e'] += time.time() - t0
        # aug_scanned_edges: n_aug_scanned_edges x 6, (eg_idx, vi, vj, rel, idx_vi, idx_vj) sorted by (eg_idx, vi, vj)
        return aug_scanned_edges, new_idx_for_edges_y, rest_idx
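The new_idx = np.argsort(sorted_idx) step in Example #7 computes the inverse permutation of the sort, i.e. where each original row of all_edges ends up inside aug_scanned_edges. A tiny illustration with made-up values:

import numpy as np

sorted_idx = np.array([2, 0, 1])   # all_edges[sorted_idx] gives the sorted order
new_idx = np.argsort(sorted_idx)   # -> [1, 2, 0]
# original row 0 now sits at position 1, row 1 at position 2, row 2 at position 0,
# so new_idx[:n_scanned_edges] maps each scanned edge to its row in aug_scanned_edges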
Example #8
 def preprocess_data(self, data, shuffle, bucketing):
     self._append_eos(data)
     data = self._get_batches(data, shuffle=shuffle, bucketing=bucketing)
     res = []
     for batch in data:
         texts, keyword_pos = zip(*batch)
         lens = torch.as_tensor([len(s) for s in texts], dtype=torch.long)
         if keyword_pos[0] is None:
             keywords = keyword_pos = None
         else:
             keywords = torch.as_tensor([t[p] for t, p in batch],
                                        dtype=torch.long)
             keyword_pos = torch.as_tensor(keyword_pos, dtype=torch.long)
         fwd_tgt = self._pad_batch_sequence(texts)
         bwd_src = fwd_tgt.flip(0)
         res.append([[fwd_tgt, bwd_src], lens, keywords, keyword_pos])
         if self.opts.need_segment_ids:
             fwd_seg_ids = get_segment_ids(
                 fwd_tgt,
                 self.SEP_token,
                 0,
                 relative=self.opts.segment_emb_relative,
                 sep_as_new_segment=self.opts.sep_as_new_segment)
             bwd_seg_ids = fwd_seg_ids.flip(0)
             res[-1].append([fwd_seg_ids, bwd_seg_ids])
         if self.opts.fwd_need_remain_syllables or self.opts.bwd_need_remain_syllables:
             fwd_rem_syl = None
             if self.opts.fwd_need_remain_syllables:
                 fwd_rem_syl = get_remain_syllables(self.word2syllable,
                                                    self.SEP_token,
                                                    decoder_target=fwd_tgt)
             bwd_rem_syl = None
             if self.opts.bwd_need_remain_syllables:
                 bwd_rem_syl = get_remain_syllables(self.word2syllable,
                                                    self.SEP_token,
                                                    decoder_input=bwd_src)
             res[-1].append([fwd_rem_syl, bwd_rem_syl])
     return res