Example #1
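prepare_data() builds and caches three on-disk artifacts for MAG240M: a symmetric paper-to-paper citation adjacency, a merged heterogeneous adjacency over papers, authors and institutions, and a full float16 feature matrix whose author and institution rows are aggregated from paper features.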
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)

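        # Step 1: build the paper-to-paper citation adjacency once,
        # symmetrize it, and cache it to disk.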
        path = f'{dataset.dir}/paper_to_paper_symmetric.pt'
        if not osp.exists(path):  # Will take approximately 5 minutes...
            t = time.perf_counter()
            print('Converting adjacency matrix...', end=' ', flush=True)
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = torch.from_numpy(edge_index)
            adj_t = SparseTensor(row=edge_index[0],
                                 col=edge_index[1],
                                 sparse_sizes=(dataset.num_papers,
                                               dataset.num_papers),
                                 is_sorted=True)
            torch.save(adj_t.to_symmetric(), path)
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

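        # Step 2: merge citation, author-writes-paper and author-institution
        # edges (plus reverses) into one (N, N) adjacency whose value encodes
        # the edge type.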
        path = f'{dataset.dir}/full_adj_t.pt'
        if not osp.exists(path):  # Will take approximately 16 minutes...
            t = time.perf_counter()
            print('Merging adjacency matrices...', end=' ', flush=True)

            row, col, _ = torch.load(
                f'{dataset.dir}/paper_to_paper_symmetric.pt').coo()
            rows, cols = [row], [col]

            edge_index = dataset.edge_index('author', 'writes', 'paper')
            row, col = torch.from_numpy(edge_index)
            row += dataset.num_papers
            rows += [row, col]
            cols += [col, row]

            edge_index = dataset.edge_index('author', 'institution')
            row, col = torch.from_numpy(edge_index)
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors
            rows += [row, col]
            cols += [col, row]

            edge_types = [
                torch.full(x.size(), i, dtype=torch.int8)
                for i, x in enumerate(rows)
            ]

            row = torch.cat(rows, dim=0)
            del rows
            col = torch.cat(cols, dim=0)
            del cols

            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

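            # Linearize each (row, col) pair into the scalar key N * row + col;
            # sorting by this key puts the edges in row-major order, which is
            # what is_sorted=True below asserts.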
            perm = (N * row).add_(col).numpy().argsort()
            perm = torch.from_numpy(perm)
            row = row[perm]
            col = col[perm]

            edge_type = torch.cat(edge_types, dim=0)[perm]
            del edge_types

            full_adj_t = SparseTensor(row=row,
                                      col=col,
                                      value=edge_type,
                                      sparse_sizes=(N, N),
                                      is_sorted=True)

            torch.save(full_adj_t, path)
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

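        # Step 3: assemble one float16 feature matrix for all N nodes; author
        # and institution rows are filled by mean-aggregation further below.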
        path = f'{dataset.dir}/full_feat.npy'
        done_flag_path = f'{dataset.dir}/full_feat_done.txt'
        if not osp.exists(done_flag_path):  # Will take ~3 hours...
            t = time.perf_counter()
            print('Generating full feature matrix...')

            node_chunk_size = 100000
            dim_chunk_size = 64
            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

            paper_feat = dataset.paper_feat
            x = np.memmap(path,
                          dtype=np.float16,
                          mode='w+',
                          shape=(N, self.num_features))

            print('Copying paper features...')
            for i in tqdm(range(0, dataset.num_papers, node_chunk_size)):
                j = min(i + node_chunk_size, dataset.num_papers)
                x[i:j] = paper_feat[i:j]

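            # Author features: mean of the feature vectors of the papers each
            # author wrote, computed as a sparse (author x paper) matmul.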
            edge_index = dataset.edge_index('author', 'writes', 'paper')
            row, col = torch.from_numpy(edge_index)
            adj_t = SparseTensor(row=row,
                                 col=col,
                                 sparse_sizes=(dataset.num_authors,
                                               dataset.num_papers),
                                 is_sorted=True)

            # Processing 64-dim subfeatures at a time for memory efficiency.
            print('Generating author features...')
            for i in tqdm(range(0, self.num_features, dim_chunk_size)):
                j = min(i + dim_chunk_size, self.num_features)
                inputs = get_col_slice(paper_feat,
                                       start_row_idx=0,
                                       end_row_idx=dataset.num_papers,
                                       start_col_idx=i,
                                       end_col_idx=j)
                inputs = torch.from_numpy(inputs)
                outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                del inputs
                save_col_slice(x_src=outputs,
                               x_dst=x,
                               start_row_idx=dataset.num_papers,
                               end_row_idx=dataset.num_papers +
                               dataset.num_authors,
                               start_col_idx=i,
                               end_col_idx=j)
                del outputs

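            # Institution features: mean over affiliated authors. Row/col are
            # swapped to get an (institution x author) adjacency; it is not
            # sorted by institution, hence is_sorted=False.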
            edge_index = dataset.edge_index('author', 'institution')
            row, col = torch.from_numpy(edge_index)
            adj_t = SparseTensor(row=col,
                                 col=row,
                                 sparse_sizes=(dataset.num_institutions,
                                               dataset.num_authors),
                                 is_sorted=False)

            print('Generating institution features...')
            # Processing 64-dim subfeatures at a time for memory efficiency.
            for i in tqdm(range(0, self.num_features, dim_chunk_size)):
                j = min(i + dim_chunk_size, self.num_features)
                inputs = get_col_slice(x,
                                       start_row_idx=dataset.num_papers,
                                       end_row_idx=dataset.num_papers +
                                       dataset.num_authors,
                                       start_col_idx=i,
                                       end_col_idx=j)
                inputs = torch.from_numpy(inputs)
                outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                del inputs
                save_col_slice(x_src=outputs,
                               x_dst=x,
                               start_row_idx=dataset.num_papers +
                               dataset.num_authors,
                               end_row_idx=N,
                               start_col_idx=i,
                               end_col_idx=j)
                del outputs

            x.flush()
            del x
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

            with open(done_flag_path, 'w') as f:
                f.write('done')
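
Both examples call two helpers, get_col_slice and save_col_slice, that are not shown. A minimal sketch of what they plausibly look like, assuming chunked row access so only ~100k rows of the memmap are touched at a time (the keyword names x_src/x_dst follow Example #1; Example #2's variant evidently names them x_from/x_to):

import numpy as np

ROW_CHUNK = 100000  # rows copied per step; bounds peak memory


def get_col_slice(x, start_row_idx, end_row_idx, start_col_idx, end_col_idx):
    # Materialize x[start_row_idx:end_row_idx, start_col_idx:end_col_idx]
    # chunk by chunk, so the memmap is never read in one huge slice.
    outs = []
    for i in range(start_row_idx, end_row_idx, ROW_CHUNK):
        j = min(i + ROW_CHUNK, end_row_idx)
        outs.append(x[i:j, start_col_idx:end_col_idx].copy())
    return np.concatenate(outs, axis=0)


def save_col_slice(x_src, x_dst, start_row_idx, end_row_idx,
                   start_col_idx, end_col_idx):
    # Write x_src into the given row/column window of x_dst, again in chunks.
    assert x_src.shape[0] == end_row_idx - start_row_idx
    assert x_src.shape[1] == end_col_idx - start_col_idx
    for i in range(0, end_row_idx - start_row_idx, ROW_CHUNK):
        j = min(i + ROW_CHUNK, end_row_idx - start_row_idx)
        x_dst[start_row_idx + i:start_row_idx + j,
              start_col_idx:end_col_idx] = x_src[i:j]
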
Example #2
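The same routine as Example #1 with three differences: paths are resolved via dataset.root instead of dataset.dir, a done-flag file records whether full_feat.npy was written completely, and feature generation runs inside try/except so a partial file is removed after a crash.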
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)

        path = f'{dataset.root}/mag240m/paper_to_paper_symmetric.pt'
        if not osp.exists(path):  # Will take approximately 5 minutes...
            t = time.perf_counter()
            print('Converting adjacency matrix...', end=' ', flush=True)
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = torch.from_numpy(edge_index)
            adj_t = SparseTensor(row=edge_index[0],
                                 col=edge_index[1],
                                 sparse_sizes=(dataset.num_papers,
                                               dataset.num_papers),
                                 is_sorted=True)
            torch.save(adj_t.to_symmetric(), path)
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        path = f'{dataset.root}/mag240m/full_adj_t.pt'
        if not osp.exists(path):  # Will take approximately 16 minutes...
            t = time.perf_counter()
            print('Merging adjacency matrices...', end=' ', flush=True)

            row, col, _ = torch.load(
                f'{dataset.root}/mag240m/paper_to_paper_symmetric.pt').coo()
            rows, cols = [row], [col]

            edge_index = dataset.edge_index('author', 'writes', 'paper')
            row, col = torch.from_numpy(edge_index)
            row += dataset.num_papers
            rows += [row, col]
            cols += [col, row]

            edge_index = dataset.edge_index('author', 'institution')
            row, col = torch.from_numpy(edge_index)
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors
            rows += [row, col]
            cols += [col, row]

            edge_types = [
                torch.full(x.size(), i, dtype=torch.int8)
                for i, x in enumerate(rows)
            ]

            row = torch.cat(rows, dim=0)
            del rows
            col = torch.cat(cols, dim=0)
            del cols

            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

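            # Linearize each (row, col) pair into the scalar key N * row + col;
            # sorting by this key puts the edges in row-major order, which is
            # what is_sorted=True below asserts.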
            perm = (N * row).add_(col).numpy().argsort()
            perm = torch.from_numpy(perm)
            row = row[perm]
            col = col[perm]

            edge_type = torch.cat(edge_types, dim=0)[perm]
            del edge_types

            full_adj_t = SparseTensor(row=row,
                                      col=col,
                                      value=edge_type,
                                      sparse_sizes=(N, N),
                                      is_sorted=True)

            torch.save(full_adj_t, path)
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        path = f'{dataset.root}/mag240m/full_feat.npy'
        # Flag file marking that full_feat.npy has been written completely.
        done_flag_path = f'{dataset.root}/mag240m/full_feat_done.txt'
        if not osp.exists(
                done_flag_path):  # Will take approximately 3 hours...
            if os.path.exists(path):
                print('Removing unfinished full_feat.npy')
                os.remove(path)

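            # Run generation inside try/except: if anything fails midway, the
            # partially written memmap is deleted so the next run starts clean.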
            try:
                t = time.perf_counter()
                print('Generating full feature matrix...')

                N = (dataset.num_papers + dataset.num_authors +
                     dataset.num_institutions)

                x = np.memmap(path,
                              dtype=np.float16,
                              mode='w+',
                              shape=(N, self.num_features))
                paper_feat = dataset.paper_feat
                dim_chunk = 64
                chunk = 100000

                print('Copying paper features...')
                for i in tqdm(range(0, dataset.num_papers,
                                    chunk)):  # Copy paper features.
                    end_idx = min(i + chunk, dataset.num_papers)
                    x[i:end_idx] = paper_feat[i:end_idx]

                edge_index = dataset.edge_index('author', 'writes', 'paper')
                row, col = torch.from_numpy(edge_index)
                adj_t = SparseTensor(row=row,
                                     col=col,
                                     sparse_sizes=(dataset.num_authors,
                                                   dataset.num_papers),
                                     is_sorted=True)

                print('Generating author features...')
                # Processing 64-dim subfeatures at a time for memory efficiency.
                for i in tqdm(range(0, self.num_features, dim_chunk)):
                    end_idx = min(i + dim_chunk, self.num_features)
                    inputs = torch.from_numpy(
                        get_col_slice(paper_feat,
                                      start_row_idx=0,
                                      end_row_idx=len(paper_feat),
                                      start_col_idx=i,
                                      end_col_idx=end_idx))
                    outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                    del inputs
                    save_col_slice(x_from=outputs,
                                   x_to=x,
                                   start_row_idx=dataset.num_papers,
                                   end_row_idx=dataset.num_papers +
                                   dataset.num_authors,
                                   start_col_idx=i,
                                   end_col_idx=end_idx)
                    del outputs

                edge_index = dataset.edge_index('author', 'institution')
                row, col = torch.from_numpy(edge_index)
                adj_t = SparseTensor(row=col,
                                     col=row,
                                     sparse_sizes=(dataset.num_institutions,
                                                   dataset.num_authors),
                                     is_sorted=False)

                print('Generating institution features...')
                # Processing 64-dim subfeatures at a time for memory efficiency.
                for i in tqdm(range(0, self.num_features, dim_chunk)):
                    end_idx = min(i + dim_chunk, self.num_features)
                    inputs = torch.from_numpy(
                        get_col_slice(x,
                                      start_row_idx=dataset.num_papers,
                                      end_row_idx=dataset.num_papers +
                                      dataset.num_authors,
                                      start_col_idx=i,
                                      end_col_idx=end_idx))
                    outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                    del inputs
                    save_col_slice(x_from=outputs,
                                   x_to=x,
                                   start_row_idx=dataset.num_papers +
                                   dataset.num_authors,
                                   end_row_idx=N,
                                   start_col_idx=i,
                                   end_col_idx=end_idx)
                    del outputs

                x.flush()
                del x
                print(f'Done! [{time.perf_counter() - t:.2f}s]')

                with open(done_flag_path, 'w') as f:
                    f.write('done')

            except Exception:
                traceback.print_exc()
                if os.path.exists(path):
                    print(
                        'Removing unfinished full feat file due to exception')
                    os.remove(path)
                sys.exit(-1)
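
For completeness: the cached artifacts are meant to be reopened cheaply on later runs. A hedged sketch of the matching setup step (the method and attribute names here are assumptions, not shown in either example; paths follow Example #1's dataset.dir convention):

    def setup(self, stage=None):
        dataset = MAG240MDataset(self.data_dir)
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)
        # The merged adjacency fits in RAM, so load it eagerly.
        self.adj_t = torch.load(f'{dataset.dir}/full_adj_t.pt')
        # The (N, num_features) float16 matrix is memory-mapped read-only,
        # so feature rows are paged in from disk on demand.
        self.x = np.memmap(f'{dataset.dir}/full_feat.npy', dtype=np.float16,
                           mode='r', shape=(N, self.num_features))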