def process(self):
    """Process the raw (points, labels) split folders of one category
    into collated train and test files with zero-based labels."""
    cat_dir = self.categories[self.category]
    split_dirs = [osp.join(raw, cat_dir) for raw in self.raw_paths]
    datasets = []
    # Raw paths come in (points dir, labels dir) pairs, one pair per split.
    for pos_dir, y_dir in zip(split_dirs[::2], split_dirs[1::2]):
        pos_files = sorted(glob.glob(osp.join(pos_dir, '*.pts')))
        seg_files = sorted(glob.glob(osp.join(y_dir, '*.seg')))
        samples = []
        for pos_file, seg_file in zip(pos_files, seg_files):
            data = Data(
                y=read_txt_array(seg_file, dtype=torch.long),
                pos=read_txt_array(pos_file),
            )
            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            samples.append(data)
        datasets.append(samples)

    makedirs(osp.join(self.processed_dir, self.category))

    train_data, train_slices = self.collate(datasets[0] + datasets[1])
    test_data, test_slices = self.collate(datasets[2])
    # Re-index labels so they are contiguous and start at zero.
    _, train_data.y = train_data.y.unique(return_inverse=True)
    _, test_data.y = test_data.y.unique(return_inverse=True)
    torch.save((train_data, train_slices), self.processed_paths[0])
    torch.save((test_data, test_slices), self.processed_paths[1])
def process_raw_path(self, data_path, label_path):
    """Read every category's point clouds and segmentation labels.

    Labels of each category are compacted and shifted into a global,
    non-overlapping range. Returns ``(data_list, y_mask)`` where
    ``y_mask[c, p] == 1`` iff global part label ``p`` occurs in
    category ``c``.
    """
    offset = 0  # first global label id available for the next category
    data_list, cat_ys = [], []
    for cat_idx, cat in enumerate(self.categories):
        folder = self.category_ids[cat]
        pts_files = sorted(glob.glob(osp.join(data_path, folder, '*.pts')))
        seg_files = sorted(glob.glob(osp.join(label_path, folder, '*.seg')))
        positions = [read_txt_array(p) for p in pts_files]
        labels = [read_txt_array(p, dtype=torch.long) for p in seg_files]
        sizes = [t.size(0) for t in labels]
        # Map this category's raw labels onto a contiguous global range.
        merged = torch.cat(labels).unique(return_inverse=True)[1] + offset
        cat_ys.append(merged.unique())
        offset = merged.max().item() + 1
        for pos, y in zip(positions, merged.split(sizes)):
            data = Data(y=y, pos=pos, category=cat_idx)
            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            data_list.append(data)
    y_mask = torch.zeros((len(self.categories), offset), dtype=torch.uint8)
    for i, ys in enumerate(cat_ys):
        y_mask[i, ys] = 1
    return data_list, y_mask
def process(self):
    """Build train/val/test splits across all categories, save them
    together with per-category part-id bookkeeping."""
    splits = [[], [], []]  # train / val / test
    category_infos = {}
    start_label = 0  # first global part id of the current category
    for cid, cat in enumerate(self.categories.keys()):
        info = {'category_id': cid}
        dirs = [osp.join(p, self.categories[cat]) for p in self.raw_paths]
        print('Processing category {}'.format(cat))
        max_label = 0  # largest global part id seen in this category
        counts = []
        # Raw paths alternate (points dir, labels dir) per split.
        for split_idx, (pos_dir, y_dir) in enumerate(
                zip(dirs[::2], dirs[1::2])):
            pos_files = sorted(glob.glob(osp.join(pos_dir, '*.pts')))
            seg_files = sorted(glob.glob(osp.join(y_dir, '*.seg')))
            split_list = []
            for pos_file, seg_file in zip(pos_files, seg_files):
                pos = read_txt_array(pos_file)
                y = read_txt_array(seg_file, dtype=torch.long)
                y = self.reindex_labels(y, start_label)
                max_label = max(max_label, y.max())
                data = Data(y=y, pos=pos, cid=cid)
                if self.pre_filter is not None and not self.pre_filter(data):
                    continue
                if self.pre_transform is not None:
                    data = self.pre_transform(data)
                split_list.append(data)
            counts.append(len(split_list))
            splits[split_idx] += split_list
        end_label = max_label.item()
        info['part_id_min'] = start_label
        info['part_id_max'] = end_label
        info['part_num'] = end_label - start_label + 1
        info['sample_num'] = counts
        category_infos[cat] = info
        print('>Category:{} Infos:{}'.format(cat, info))
        start_label = end_label + 1  # next category starts after this one

    for i, split_list in enumerate(splits):
        torch.save(self.collate(split_list), self.processed_paths[i])
    torch.save(category_infos, osp.join(self.processed_dir, 'all_infos.pt'))
def process(self):
    """Sample ``self.num`` faces per mesh, restrict ``pos`` to the
    vertices those faces reference, and save the collated dataset.

    Fix: the original built a ``set`` of 0-dim tensors, which hash by
    object identity, so equal vertex indices were NOT deduplicated by
    value; ``face.unique()`` performs the intended sorted
    deduplication.
    """
    data_list = []
    for cat in self.categories:
        paths = glob.glob(osp.join(self.raw_dir, '{}*.tri'.format(cat)))
        paths = [path[:-4] for path in paths]
        paths = sorted(paths, key=lambda e: (len(e), e))
        for path in paths:
            pos = read_txt_array('{}.vert'.format(path))
            face = read_txt_array('{}.tri'.format(path), dtype=torch.long)
            face = face.t().contiguous() - 1  # 1-based -> 0-based indices
            # NOTE(review): range(size - 1) can never pick the final
            # face — confirm whether excluding it is intentional.
            random_list = sorted(
                random.sample(range(int(face.size(1) - 1)), self.num))
            face = face[:, random_list]
            # Sorted, value-level deduplication of referenced vertices.
            pos_list = face.unique().tolist()
            remap = {v: i for i, v in enumerate(pos_list)}
            for i in range(face.size(1)):
                face[0][i] = remap[int(face[0][i])]
                face[1][i] = remap[int(face[1][i])]
                face[2][i] = remap[int(face[2][i])]
            pos = pos[pos_list]
            assert pos.size(1) == 3 and face.size(0) == 3
            # Each triangle contributes its three edges.
            edge_index = torch.cat([face[:2], face[1:], face[::2]], dim=1)
            edge_index = to_undirected(edge_index, num_nodes=self.num)
            data = Data(pos=pos, face=face, edge_index=edge_index,
                        y=self.categories.index(cat))
            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            data_list.append(data)
    torch.save(self.collate(data_list), self.processed_paths[0])
def process_graph(self, triple_path, feature_path, embeddings):
    """Parse a triple file and per-entity text features into tensors.

    Returns ``(x, edge_index, rel, assoc)`` where ``assoc`` maps raw
    entity ids to the compact row indices of ``x``.
    """
    triples = read_txt_array(triple_path, sep='\t', dtype=torch.long)
    subj, rel, obj = triples.t()

    feats = {}
    with open(feature_path, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                # Missing description: fall back to the unknown token.
                fields = fields + ['**UNK**']
            tokens = fields[1].lower().split()
            vecs = [embeddings.get(t, embeddings['**UNK**']) for t in tokens]
            feats[int(fields[0])] = torch.stack(vecs, dim=0)

    # Build the raw-id -> compact-index mapping (-1 for unseen ids).
    raw_ids = torch.tensor(list(feats.keys()))
    assoc = torch.full((raw_ids.max().item() + 1, ), -1, dtype=torch.long)
    assoc[raw_ids] = torch.arange(raw_ids.size(0))

    edge_index = torch.stack([assoc[subj], assoc[obj]], dim=0)
    edge_index, rel = sort_edge_index(edge_index, rel)

    seqs = [None] * raw_ids.size(0)
    for raw_id, seq in feats.items():
        seqs[assoc[raw_id]] = seq
    x = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True)
    return x, edge_index, rel, assoc
def process(self):
    """Load .off meshes with .seg labels and save train/val splits."""
    data_list = []
    for off_path in glob.glob('{}/*.off'.format(self.raw_paths[0])):
        data = read_off(off_path)
        shape_id = osp.basename(off_path).rsplit('.', 1)[0]
        seg_path = osp.join(self.raw_paths[1], shape_id + '.seg')
        # Shift labels so they start from 0. NOTE(review): labels keep
        # the reader's default float dtype here — confirm downstream
        # consumers expect that.
        data.y = read_txt_array(seg_path) - 1
        if self.classification is not None:
            # Binary target: does the requested class occur in the shape?
            present = self.classification in data.y
            data.y = torch.tensor([1]) if present else torch.tensor([0])
        data.shape_id = torch.tensor([int(shape_id)])
        data_list.append(data)

    if self.pre_filter is not None:
        data_list = [d for d in data_list if self.pre_filter(d)]
    if self.pre_transform is not None:
        data_list = [self.pre_transform(d) for d in data_list]

    # Hold out the final 15% of samples as the validation split.
    cut = len(data_list) - int(len(data_list) * 0.15)
    torch.save(self.collate(data_list[:cut]), self.processed_paths[0])
    torch.save(self.collate(data_list[cut:]), self.processed_paths[1])
def process_events(self):
    """Concatenate the first four tab-separated columns of every raw
    file into one event tensor, with the time column divided by 24
    (presumably hours -> days; confirm units against the raw data).

    Returns a ``(num_events, 4)`` long tensor.

    Fix: ``data[:, 3] / 24`` promoted the long column to a float tensor
    before the truncating cast back on assignment, losing precision for
    timestamps above 2**24. Pure integer division with truncation
    reproduces the original rounding without the float round-trip.
    """
    events = []
    for path in self.raw_paths:
        data = read_txt_array(path, sep='\t', end=4, dtype=torch.long)
        # 'trunc' matches the old float-to-long cast for all signs.
        data[:, 3] = torch.div(data[:, 3], 24, rounding_mode='trunc')
        events += [data]
    return torch.cat(events, dim=0)
def process(self):
    """Group temporal triples into one ``Data`` snapshot per distinct
    time bucket and save the collated snapshots plus per-file offsets.

    Fix: ``time / self.granularity`` promotes the long tensor to float
    on modern PyTorch, and ``bincount()`` only accepts integral
    tensors, so the original crashed. Integer division keeps ``time``
    a long tensor. ('floor' equals the legacy integer division for the
    non-negative timestamps expected here — confirm raw data has no
    negative times.)
    """
    data_list = []
    splits = [0]  # cumulative snapshot counts, one boundary per raw file
    for raw_path in self.raw_paths:
        srot = read_txt_array(raw_path, sep='\t', end=4, dtype=torch.long)
        row, rel, col, time = srot.t().contiguous()
        time = torch.div(time, self.granularity, rounding_mode='floor')
        # NOTE(review): the split below assumes `time` is sorted within
        # each raw file — verify against the raw format.
        count = time.bincount()
        split_sections = count[count > 0].tolist()
        rows = row.split(split_sections)
        cols = col.split(split_sections)
        rels = rel.split(split_sections)
        times = time.split(split_sections)
        splits.append(splits[-1] + len(rows))

        for r, c, rl, t in zip(rows, cols, rels, times):
            data = Data(edge_index=torch.stack([r, c], dim=0),
                        edge_type=rl, time=t)
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            data_list.append(data)

    torch.save(self.collate(data_list), self.processed_paths[0])
    torch.save(splits, self.processed_paths[1])
def read_file(folder, prefix, name):
    """Load one serialized Planetoid-style raw file.

    Returns a long tensor for 'test.index', the raw unpickled object
    for 'graph', and a dense float tensor otherwise. For the
    'ddi_constraint'/'decagon' feature matrices, one-hot features are
    replaced by a 32-dim Gaussian random projection for fast training.

    Fix: removed the dead Python-2 branch — ``pickle.load(...,
    encoding='latin1')`` only exists on Python 3, so the
    ``sys.version_info`` check could never take the other path there.
    """
    path = osp.join(folder, 'ind.{}.{}'.format(prefix.lower(), name))
    if name == 'test.index':
        return read_txt_array(path, dtype=torch.long)
    # NOTE(review): pickle.load is unsafe on untrusted files — these are
    # expected to be the dataset's own downloaded archives.
    with open(path, 'rb') as f:
        out = pickle.load(f, encoding='latin1')
    if name == 'graph':
        return out
    # Densify scipy sparse matrices before conversion.
    out = out.todense() if hasattr(out, 'todense') else out
    print('If input x has nan or inf', np.isinf(out).any(),
          np.isnan(out).any())
    # For fast training, discard one-hot encoding and use a 32-dim
    # Gaussian random projection instead.
    if prefix == 'ddi_constraint' or prefix == 'decagon':
        if name == 'allx':
            transformer = random_projection.GaussianRandomProjection(
                n_components=32)
            out = transformer.fit_transform(out)
    out = torch.FloatTensor(out)
    return out
def process(self):
    """Read every (vert, tri) mesh pair of each category and save the
    collated dataset."""
    data_list = []
    for cat in self.categories:
        stems = [p[:-4] for p in
                 glob.glob(osp.join(self.raw_dir, '{}*.tri'.format(cat)))]
        # Shorter names first, then lexicographic (numeric suffix order).
        for stem in sorted(stems, key=lambda e: (len(e), e)):
            pos = read_txt_array('{}.vert'.format(stem))
            face = read_txt_array('{}.tri'.format(stem), dtype=torch.long)
            data = Data(pos=pos, face=face.t().contiguous())
            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            data_list.append(data)
    torch.save(self.collate(data_list), self.processed_paths[0])
def process(self):
    """Process one category's (points, labels) split folders into
    collated train (first two splits) and test files.

    Fix: apply ``self.pre_filter`` like every sibling ``process``
    implementation in this file does; with the default of ``None``
    this is a no-op, so existing behavior is preserved.
    """
    cat_dir = self.categories[self.category]
    paths = [osp.join(path, cat_dir) for path in self.raw_paths]
    datasets = []
    # Raw paths come in (points dir, labels dir) pairs, one per split.
    for pos_dir, y_dir in zip(paths[::2], paths[1::2]):
        pos_paths = sorted(glob.glob(osp.join(pos_dir, '*.pts')))
        y_paths = sorted(glob.glob(osp.join(y_dir, '*.seg')))
        data_list = []
        for pos_path, y_path in zip(pos_paths, y_paths):
            pos = read_txt_array(pos_path)
            y = read_txt_array(y_path, dtype=torch.long)
            data = Data(y=y, pos=pos)
            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            data_list.append(data)
        datasets.append(data_list)

    makedirs(osp.join(self.processed_dir, self.category))
    torch.save(self.collate(datasets[0] + datasets[1]),
               self.processed_paths[0])
    torch.save(self.collate(datasets[2]), self.processed_paths[1])
def process_raw_path(self, data_path, label_path):
    """Load all categories' point clouds, re-indexing each category's
    labels into a contiguous, non-overlapping global range."""
    offset = 0  # first global label id available for the next category
    data_list = []
    for category in self.categories:
        folder = self.category_ids[category]
        pts_files = sorted(glob.glob(osp.join(data_path, folder, '*.pts')))
        seg_files = sorted(glob.glob(osp.join(label_path, folder, '*.seg')))
        positions = [read_txt_array(p) for p in pts_files]
        labels = [read_txt_array(p, dtype=torch.long) for p in seg_files]
        sizes = [t.size(0) for t in labels]
        # Compact this category's labels and shift into the global range.
        merged = torch.cat(labels).unique(return_inverse=True)[1] + offset
        offset = merged.max().item() + 1
        for pos, y in zip(positions, merged.split(sizes)):
            data = Data(y=y, pos=pos)
            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            data_list.append(data)
    return data_list
def process(self):
    """Assemble point clouds with normal+curvature features and test
    indices for every shape listed in the first raw file."""
    with open(self.raw_paths[0], "r") as f:
        # The file list ends with a trailing newline; drop the empty tail.
        filenames = f.read().split('\n')[:-1]

    data_list = []
    for filename in filenames:
        stem = osp.join(self.raw_dir, filename)
        pos = read_txt_array(stem + '.xyz')
        normals = read_txt_array(stem + '.normals')
        curv = read_txt_array(stem + '.curv')
        data = Data(pos=pos, x=torch.cat([normals, curv], dim=1))
        data.test_idx = read_txt_array(stem + '.pidx', dtype=torch.long)
        if self.pre_filter is not None and not self.pre_filter(data):
            continue
        if self.pre_transform is not None:
            data = self.pre_transform(data)
        data_list.append(data)

    torch.save(self.collate(data_list), self.processed_paths[0])
def read_file(folder, prefix, name):
    """Load one serialized Planetoid-style raw file.

    Returns a long tensor for 'test.index', the raw unpickled object
    for 'graph', and a dense float tensor otherwise.

    Fix: removed the dead Python-2 branch — ``pickle.load(...,
    encoding='latin1')`` only exists on Python 3, so the
    ``sys.version_info`` check could never take the other path there.
    """
    path = osp.join(folder, 'ind.{}.{}'.format(prefix.lower(), name))
    if name == 'test.index':
        return read_txt_array(path, dtype=torch.long)
    # NOTE(review): pickle.load is unsafe on untrusted files — these are
    # expected to be the dataset's own downloaded archives.
    with open(path, 'rb') as f:
        out = pickle.load(f, encoding='latin1')
    if name == 'graph':
        return out
    # Densify scipy sparse matrices before tensor conversion.
    out = out.todense() if hasattr(out, 'todense') else out
    return torch.Tensor(out)
def process(self):
    """Process the reference ('null') shape plus the train and test
    deformed shapes of one part/category pair."""
    ref_data = read_off(
        osp.join(self.raw_paths[0], 'null', '{}.off'.format(self.cat)))

    pattern = '{}_{}_*.off'.format(self.part, self.cat)

    def _sorted_stems(root):
        # Shorter names first, then lexicographic (numeric suffix order).
        stems = [p[:-4]
                 for p in glob.glob(osp.join(root, self.part, pattern))]
        return sorted(stems, key=lambda e: (len(e), e))

    train_list = []
    for stem in _sorted_stems(self.raw_paths[0]):
        data = read_off('{}.off'.format(stem))
        bary = read_txt_array('{}.baryc_gt'.format(stem))
        data.y = bary[:, 0].to(torch.long) - 1  # triangle id, 0-based
        data.y_baryc = bary[:, 1:]  # barycentric coordinates
        train_list.append(data)

    test_list = [read_off('{}.off'.format(stem))
                 for stem in _sorted_stems(self.raw_paths[1])]

    if self.pre_filter is not None:
        train_list = [d for d in train_list if self.pre_filter(d)]
        test_list = [d for d in test_list if self.pre_filter(d)]
    if self.pre_transform is not None:
        ref_data = self.pre_transform(ref_data)
        train_list = [self.pre_transform(d) for d in train_list]
        test_list = [self.pre_transform(d) for d in test_list]

    torch.save(ref_data, self.processed_paths[0])
    torch.save(self.collate(train_list), self.processed_paths[1])
    torch.save(self.collate(test_list), self.processed_paths[2])
def read_file(folder, prefix, name, dtype=None):
    """Read ``<folder>/<prefix>_<name>.txt`` as a comma-separated
    tensor of the given dtype."""
    filename = '{}_{}.txt'.format(prefix, name)
    return read_txt_array(osp.join(folder, filename), sep=',', dtype=dtype)
def process_y(self, path, assoc1, assoc2):
    """Read ground-truth alignment pairs from ``path`` and remap both
    sides through their association tensors, keeping only rows whose
    third column is non-zero.

    Returns a ``(2, num_kept_pairs)`` long tensor.

    Fix: index with a ``bool`` mask — ``uint8`` mask indexing is
    deprecated on modern PyTorch and emits warnings; the semantics are
    identical.
    """
    row, col, mask = read_txt_array(path, sep='\t', dtype=torch.long).t()
    mask = mask.to(torch.bool)
    return torch.stack([assoc1[row[mask]], assoc2[col[mask]]], dim=0)