class GraphLoader(object):
    def __init__(self,
                 data_path,
                 feature_meta,
                 test_p,
                 gnx_idx=None,
                 cuda_num=None,
                 logger=None):
        super(GraphLoader, self).__init__()
        self._logger = EmptyLogger() if logger is None else logger
        self._gnx_idx = gnx_idx
        self._cuda_num = cuda_num
        self._test_p = test_p
        self._features_meta = feature_meta
        self._data_path = data_path

        self._train_set = self._test_set = self._train_idx = self._test_idx = None
        self._inputs = self._targets = None
        self._nodes_order = []

        self._labels = {i: label for i, label in enumerate(self._get_labels())}
        self._prepare_data()

    def _get_labels(self):
        gnx = pickle.load(
            open(os.path.join(next(self._get_gnx_paths()), "gnx.pkl"), "rb"))
        return gnx.graph["node_labels"]

    @staticmethod
    def _encode_onehot_gnx1(gnx, nodes_order):
        labels = gnx.graph["node_labels"]
        labels_dict = {
            c: np.identity(len(labels))[i, :]
            for i, c in enumerate(labels)
        }
        labels_dict.update({i: labels_dict[c] for i, c in enumerate(labels)})
        return np.array([labels_dict[gnx.node[n]['label']] for n in nodes_order],
                        dtype=np.int32)

    def _encode_onehot_gnx(self, gnx, nodes_order):
        ident = np.identity(len(self._labels))
        labels_dict = {label: ident[j, :] for j, label in self._labels.items()}
        return np.array([labels_dict[gnx.node[n]['label']] for n in nodes_order],
                        dtype=np.int32)

    def _join_graphs1(self):
        all_nodes = set()
        common_nodes = None
        for path in self._get_gnx_paths():
            gnx = pickle.load(open(os.path.join(path, "orig_gnx.pkl"), "rb"))
            all_nodes = all_nodes.union(gnx)
            if common_nodes is None:
                common_nodes = set(gnx)
            else:
                common_nodes = common_nodes.intersection(gnx)

        # `path` still points at the last graph directory from the loop above;
        # the aggregated node sets are cached two directory levels up from it.
        pickle.dump(
            all_nodes,
            open(os.path.join(path, "..", "..", "all_nodes.pkl"), "wb"))
        pickle.dump(
            common_nodes,
            open(os.path.join(path, "..", "..", "common_nodes.pkl"), "wb"))
        return all_nodes, common_nodes

    def _join_graphs(self):
        path = next(self._get_gnx_paths())
        all_nodes = pickle.load(
            open(os.path.join(path, "..", "..", "all_nodes.pkl"), "rb"))
        common_nodes = pickle.load(
            open(os.path.join(path, "..", "..", "common_nodes.pkl"), "rb"))
        return all_nodes, common_nodes

    def _split_data(self):
        feat_path = os.path.join(next(self._get_gnx_paths()), "features_0")
        gnx = pickle.load(open(os.path.join(feat_path, "gnx.pkl"), "rb"))
        self._nodes_order = sorted(
            [node for node in gnx if gnx.node[node]['label'] is not None])
        idx, nodes = zip(*enumerate(self._nodes_order))
        c_train, c_test, c_train_idx, c_test_idx = train_test_split(
            nodes, idx, test_size=self._test_p, shuffle=True)

        self._train_set = set(c_train)
        self._test_set = set(c_test)
        self._test_idx = np.array(c_test_idx)
        self._train_idx = np.array(c_train_idx)

    def _split_data_orig(self):
        all_nodes, common_nodes = self._join_graphs()

        self._nodes_order = sorted(all_nodes)
        indexes = [(i, node) for i, node in enumerate(self._nodes_order)]
        common, uncommon = [], []
        for i, node in indexes:
            cur_list = common if node in common_nodes else uncommon
            cur_list.append((i, node))
        c_idx, c_nodes = zip(*common)
        c_train, c_test, c_train_idx, c_test_idx = train_test_split(
            c_nodes, c_idx, test_size=self._test_p, shuffle=True)
        uc_idx, uc_nodes = zip(*uncommon)
        uc_train, uc_test, uc_train_idx, uc_test_idx = train_test_split(
            uc_nodes, uc_idx, test_size=self._test_p, shuffle=True)

        self._train_set = set(c_train).union(uc_train)
        self._test_set = set(c_test).union(uc_test)
        self._test_idx = np.array(c_test_idx + uc_test_idx)
        self._train_idx = np.array(c_train_idx + uc_train_idx)

    def _activate_cuda(self, *args):
        if self._cuda_num is None:
            return args
        return [x.cuda(self._cuda_num) for x in args]

    # firms/years/features_0-1
    def _get_gnx_paths(self):
        paths = sorted(os.listdir(self._data_path), key=int)
        if self._gnx_idx is not None:
            yield os.path.join(self._data_path, paths[self._gnx_idx])
            return
        for path in paths:
            yield os.path.join(self._data_path, path)

    def _prepare_data1(self):
        self._split_data()

        self._inputs = self._targets = None
        for path in self._get_gnx_paths():
            feat_path = os.path.join(path, "features_0")
            cur_data = pickle.load(
                open(os.path.join(feat_path, "data.pkl"), "rb"))
            self._inputs = cur_data if self._inputs is None else np.dstack(
                (self._inputs, cur_data))
            cur_labels = pickle.load(
                open(os.path.join(feat_path, "labels.pkl"), "rb"))
            self._targets = cur_labels if self._targets is None else np.dstack(
                (self._targets, cur_labels))
        self._inputs = self._inputs.transpose((0, 2, 1))
        self._targets = self._targets.transpose((0, 2, 1))
        self._logger.debug("Finished preparing the data")

    def _prepare_data(self):
        self._split_data()

        self._inputs = self._targets = None
        for path in self._get_gnx_paths():
            feat_path = os.path.join(path, "features_0")
            gnx = pickle.load(open(os.path.join(feat_path, "gnx.pkl"), "rb"))
            gnx = gnx.subgraph(self._nodes_order)

            features = GraphFeatures(gnx,
                                     self._features_meta,
                                     dir_path=feat_path,
                                     logger=self._logger)
            features.build(include=self._train_set)

            add_ones = bool(
                set(self._features_meta).intersection(
                    ["first_neighbor_histogram", "second_neighbor_histogram"]))
            cur_data = features.to_matrix(add_ones=add_ones,
                                          dtype=np.float32,
                                          mtype=np.array,
                                          should_zscore=True)
            self._inputs = cur_data if self._inputs is None else np.dstack(
                (self._inputs, cur_data))
            pickle.dump(cur_data,
                        open(os.path.join(feat_path, "data.pkl"), "wb"))

            cur_labels = self._encode_onehot_gnx(gnx, self._nodes_order)
            self._targets = cur_labels if self._targets is None else np.dstack(
                (self._targets, cur_labels))
            pickle.dump(cur_labels,
                        open(os.path.join(feat_path, "labels.pkl"), "wb"))

        # Arranging data as <batch, seq, feature>
        if self._gnx_idx is None:
            self._inputs = self._inputs.transpose((0, 2, 1))
            self._targets = self._targets.transpose((0, 2, 1))
        self._logger.debug("Finished preparing the data")

    def _load_data(self, indexes, nbatch):
        inputs, targets = self._inputs[indexes], self._targets[indexes]
        for i in range(0, len(inputs), nbatch):
            data, labels = inputs[i:i + nbatch], targets[i:i + nbatch]
            # np.where on the one-hot targets yields the class index along
            # the feature axis (see feat_dim below).
            data = Variable(torch.FloatTensor(data))
            labels = Variable(torch.LongTensor(np.where(labels)[self.feat_dim]))
            yield self._activate_cuda(data, labels)

    def load_train_data(self, nbatch):
        return self._load_data(self._train_idx, nbatch)

    def load_test_data(self, nbatch):
        return self._load_data(self._test_idx, nbatch)

    @property
    def feat_dim(self):
        # Feature axis: 2 for <batch, seq, feature>; 1 when a single graph
        # is loaded and there is no sequence axis.
        return 2 if self._gnx_idx is None else 1

    @property
    def num_nodes(self):
        return self._inputs.shape[0]

    @property
    def sequence_len(self):
        return self._inputs.shape[1]

    @property
    def num_features(self):
        return self._inputs.shape[self.feat_dim]

    @property
    def num_labels(self):
        return len(self._labels)

    @property
    def labels(self):
        return self._labels

    @property
    def num_layers(self):
        # Hidden-layer sizes for the downstream model (despite the name).
        return [100, 20]
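
# Usage sketch -- an assumption, not part of the original source. `FEATURES_META`
# stands in for the project's feature-meta mapping, and "firms/years" for the
# expected <data_path>/<graph_idx>/features_0 directory layout; both are
# hypothetical placeholders here.
if __name__ == "__main__":
    loader = GraphLoader("firms/years", FEATURES_META, test_p=0.3)
    print("nodes=%d, seq=%d, features=%d"
          % (loader.num_nodes, loader.sequence_len, loader.num_features))
    for data, labels in loader.load_train_data(nbatch=64):
        # data: FloatTensor <batch, seq, feature>; labels: LongTensor of class ids
        break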
Example #2
class GraphLoader(object):
    def __init__(self, paths, is_max_connected, ignore_index=-1, norm_adj=True, logger=None, cuda_num=None, dtype=torch.double):
        super(GraphLoader, self).__init__()
        self._logger = EmptyLogger() if logger is None else logger
        self._paths = paths
        self._ignore_index = ignore_index
        self._cuda_num = cuda_num
        self._dtype = dtype

        self._logger.debug("Loading %s dataset...", paths["features"])
        self._gnx = pickle.load(open(paths["gnx"], "rb"))
        self._is_max_connected = is_max_connected
        if is_max_connected:
            self._gnx = get_max_subgraph(self._gnx)

        self.ordered_nodes = sorted(self._gnx)
        self._labeled_nodes = set(i for i, n in enumerate(self.ordered_nodes) if "label" in self._gnx.node[n])
        self._labels = {i: label for i, label in enumerate(self._gnx.graph["node_labels"])}
        self._node_labels = self._get_node_labels()

        self._content = OrderedDict(sorted(pickle.load(open(paths["content"], "rb")).items(), key=lambda x: x[0]))
        bow_mx = np.vstack(list(self._content.values())).astype(DTYPE)
        # Nodes missing from the content file fall back to the per-feature median BoW vector.
        median_bow = np.median(bow_mx, axis=0)
        bow_mx = np.vstack([self._content.get(node, median_bow) for node in self.ordered_nodes]).astype(DTYPE)

        self._bow_mx = z_scoring(bow_mx)
        self._topo_mx = None

        # Adjacency matrices
        adj = nx.adjacency_matrix(self._gnx, nodelist=self.ordered_nodes).astype(DTYPE)
        self._adj = handle_matrix_symmetric(adj, should_normalize=norm_adj)
        self._adj = sparse_mx_to_torch_sparse_tensor(self._adj).to_dense()
        self._adj_rt = handle_matrix_concat(adj, should_normalize=norm_adj)
        self._adj_rt = sparse_mx_to_torch_sparse_tensor(self._adj_rt).to_dense()

        self._train_set = self._test_set = None
        self._train_idx = self._test_idx = self._base_train_idx = None
        self._val_idx = None

    @property
    def name(self):
        return str(self._paths["name"])

    @property
    def is_graph_directed(self):
        return self._gnx.is_directed()

    def _get_node_labels(self):
        labels = self._labels.copy()
        # Unlabeled nodes (label None) map to the ignore_index.
        labels[self._ignore_index] = None
        labels_dict = {label: i for i, label in labels.items()}
        return np.array([labels_dict[self._gnx.node[n].get('label')]
                         for n in self.ordered_nodes], dtype=np.int32)

    def set_variables(self, **kwargs):
        for key, val in kwargs.items():
            self.__setattr__(key, val)

    @property
    def num_labels(self):
        return len(self._labels)

    @property
    def labels(self):
        labels = torch.LongTensor(self._node_labels)
        return activate_cuda(labels, cuda_num=self._cuda_num)

    @property
    def distinct_labels(self):
        return sorted(self._labels.keys())

    @property
    def bow_mx(self):
        bow_feat = torch.DoubleTensor(self._bow_mx)
        return activate_cuda(bow_feat, cuda_num=self._cuda_num)

    @property
    def topo_mx(self):
        assert self._topo_mx is not None, "Split train required"
        topo_feat = torch.DoubleTensor(self._topo_mx)
        return activate_cuda(topo_feat, cuda_num=self._cuda_num)

    @property
    def adj_rt_mx(self):
        return activate_cuda(self._adj_rt, cuda_num=self._cuda_num)

    @property
    def adj_mx(self):
        return activate_cuda(self._adj, cuda_num=self._cuda_num).type(self._dtype)

    def set_train(self, train_set, features_meta):
        features = GraphFeatures(self._gnx, features_meta, dir_path=self._paths["features"], logger=self._logger,
                                 is_max_connected=False)  # Already taking the max sub_graph in init
        features.build(include=set(train_set), should_dump=True)

        add_ones = bool({"first_neighbor_histogram", "second_neighbor_histogram"}.intersection(features_meta))
        self._topo_mx = features.to_matrix(add_ones=add_ones, dtype=np.float64, mtype=np.matrix, should_zscore=True)

        ratio = 10 ** np.ceil(np.log10(abs(np.mean(self._topo_mx) / np.mean(self._bow_mx))))
        self._topo_mx /= ratio
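
# Usage sketch -- an assumption, not part of the original source. The `paths`
# keys mirror the ones read in __init__ ("name", "gnx", "content", "features");
# the file names, `FEATURES_META`, and the toy train split are hypothetical.
if __name__ == "__main__":
    paths = {"name": "cora", "gnx": "data/gnx.pkl",
             "content": "data/content.pkl", "features": "data/features"}
    loader = GraphLoader(paths, is_max_connected=True)
    train_nodes = loader.ordered_nodes[:100]  # stand-in for a real train split
    loader.set_train(train_nodes, FEATURES_META)
    adj, topo, bow = loader.adj_mx, loader.topo_mx, loader.bow_mx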
Example #3
class GraphFeatures(dict):
    def __init__(self,
                 gnx,
                 features,
                 dir_path,
                 logger=None,
                 is_max_connected=False):
        self._base_dir = dir_path
        self._logger = EmptyLogger() if logger is None else logger
        self._matrix = None

        self._gnx = get_max_subgraph(gnx) if is_max_connected else gnx

        self._abbreviations = {
            abbr: name
            for name, meta in features.items() for abbr in meta.abbr_set
        }

        # building the feature calculators data structure
        super(GraphFeatures, self).__init__({
            name: meta.calculator(self._gnx, logger=logger)
            for name, meta in features.items()
        })

    @property
    def graph(self):
        return self._gnx

    def _build_serially(self,
                        include,
                        force_build: bool = False,
                        dump_path: str = None):
        if VERBOSE:
            self._logger.debug("Start building graph features")
        if dump_path is not None and self._gnx is not None:
            pickle.dump(self._gnx,
                        open(self._feature_path("gnx", dump_path), "wb"))
        for name, feature in self.items():
            if force_build or not os.path.exists(self._feature_path(name)):
                is_dumped = dump_path is not None and feature.DUMPABLE
                msg = "Dumped to: %s" % dump_path if is_dumped else "Not dumped"
                feature.build(include=include, msg=msg)
                if is_dumped:
                    self._dump_feature(name, feature, dump_path)
            else:
                self._load_feature(name)
        if VERBOSE:
            self._logger.debug("Finished building graph features")

    # a single process means it is calculated serially
    def build(self,
              num_processes: int = 1,
              include: set = None,
              should_dump: bool = False):
        if include is None:
            include = set()

        if 1 == num_processes:
            dump_path = None
            if should_dump:
                dump_path = self._base_dir
                if not os.path.exists(dump_path):
                    os.makedirs(dump_path)
            return self._build_serially(include, dump_path=dump_path)

        request_queue = Queue()
        workers = [
            Worker(request_queue, self, include, logger=self._logger)
            for _ in range(num_processes)
        ]
        # Starting all workers
        for worker in workers:
            worker.start()

        # Feeding the queue with all the features
        for feature_name in self:
            request_queue.put(feature_name)

        # Sentinel objects to allow clean shutdown: 1 per worker.
        for _ in range(num_processes):
            request_queue.put(None)

        # Joining all workers
        for worker in workers:
            worker.join()

    def _load_feature(self, name):
        if self._gnx is None:
            assert os.path.exists(self._feature_path("gnx")), \
                "Graph is not present in the given directory"
            self._gnx = pickle.load(open(self._feature_path("gnx"), "rb"))
        feature = pickle.load(open(self._feature_path(name), "rb"))
        feature.load_meta({
            meta_name: getattr(self, meta_name)
            for meta_name in FeatureCalculator.META_VALUES
        })
        self[name] = feature
        return self[name]

    def __getattr__(self, name):
        if name not in self:
            if name in self._abbreviations:
                name = self._abbreviations[name]
            else:
                return super(GraphFeatures, self).__getattribute__(name)

        # if obj is already calculated - return it
        obj = self[name]
        if obj.is_loaded:
            return obj

        # if obj is not calculated, check if it exist on the file system
        # if it doesn't - calculate it, if it does - load it and return it
        if not os.path.exists(self._feature_path(name)):
            obj.build()
            return obj

        return self._load_feature(name)

    @property
    def features(self):
        return set(self)

    def _feature_path(self, name, dir_path=None):
        if dir_path is None:
            dir_path = self._base_dir
        return os.path.join(dir_path, name + ".pkl")

    def _dump_feature(self, name, feature, dir_path):
        if feature.is_loaded:
            # Strip meta values before pickling so unnecessary data is not saved
            prev_meta = feature.clean_meta()
            pickle.dump(feature, open(self._feature_path(name, dir_path), "wb"))
            feature.load_meta(prev_meta)

    def dump(self, dir_path=None):
        if dir_path is None:
            dir_path = self._base_dir

        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        for name, feature in self.items():
            self._dump_feature(name, feature, dir_path)

    @property
    def shape(self):
        sorted_features = map(at(1), sorted(self.items(), key=at(0)))
        sorted_features = [
            feature for feature in sorted_features
            if feature.is_relevant() and feature.is_loaded
        ]
        return [(feature.print_name(), feature.shape[1])
                for feature in sorted_features]

    def to_matrix(self,
                  entries_order: list = None,
                  add_ones=False,
                  dtype=None,
                  mtype=np.matrix,
                  should_zscore: bool = True):
        if entries_order is None:
            entries_order = sorted(self._gnx)

        sorted_features = map(at(1), sorted(self.items(), key=at(0)))
        # Consider caching the matrix creation (if it takes long time)
        sorted_features = [
            feature for feature in sorted_features
            if feature.is_relevant() and feature.is_loaded
        ]

        if sorted_features:
            mx = np.hstack([
                feature.to_matrix(entries_order,
                                  mtype=mtype,
                                  should_zscore=should_zscore)
                for feature in sorted_features
            ])
            if add_ones:
                mx = np.hstack([mx, np.ones((mx.shape[0], 1))])
            if dtype is not None:
                mx = mx.astype(dtype)
        else:
            mx = np.matrix([])

        return mtype(mx)

    def to_dict(self, dtype=None, should_zscore: bool = True):
        mx = self.to_matrix(dtype=dtype,
                            mtype=np.matrix,
                            should_zscore=should_zscore)
        return {node: mx[i, :] for i, node in enumerate(sorted(self._gnx))}
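
# Usage sketch -- an assumption, not part of the original source. `gnx`,
# `train_nodes`, and `FEATURES_META` are hypothetical: the latter is a
# {name: meta} mapping whose meta objects expose `.calculator` and
# `.abbr_set`, which is all the constructor above relies on.
if __name__ == "__main__":
    features = GraphFeatures(gnx, FEATURES_META, dir_path="out/features")
    features.build(num_processes=1, include=set(train_nodes), should_dump=True)
    mx = features.to_matrix(add_ones=True, dtype=np.float64,
                            mtype=np.matrix, should_zscore=True)
    print(features.shape)  # [(feature_name, n_columns), ...]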
Example #4
class GraphLoader(object):
    def __init__(self, data_dir, features_meta, is_max_connected=False, cuda_num=None, logger=None):
        super(GraphLoader, self).__init__()
        self._logger = EmptyLogger() if logger is None else logger
        self._data_path = data_dir
        self._cuda_num = cuda_num
        self._features_meta = features_meta
        self._is_max_connected = is_max_connected

        self._logger.debug("Loading %s dataset...", self._data_path)
        features_path = self._features_path()
        gpath = os.path.realpath(os.path.join(features_path, "..", "gnx.pkl"))
        self._gnx = pickle.load(open(gpath, "rb"))

        self._nodes_order = sorted(self._gnx)
        self._labels = {i: label for i, label in enumerate(self._gnx.graph["node_labels"])}
        self._ident_labels = self._encode_onehot_gnx()

        self._content = pickle.load(open(os.path.join(self._data_path, "content.pkl"), "rb"))
        bow_mx = np.vstack([self._content[node] for node in self._nodes_order]).astype(DTYPE)
        self._bow_mx = normalize(bow_mx)
        self._topo_mx = None

        # Adjacency matrices
        adj = nx.adjacency_matrix(self._gnx, nodelist=self._nodes_order).astype(DTYPE)
        self._adj = handle_matrix_symmetric(adj)
        self._adj = sparse_mx_to_torch_sparse_tensor(self._adj).to_dense()
        self._adj_rt = handle_matrix_concat(adj, should_normalize=True)
        self._adj_rt = sparse_mx_to_torch_sparse_tensor(self._adj_rt).to_dense()

        self._train_set = self._test_set = None
        self._train_idx = self._test_idx = self._base_train_idx = None
        self._val_idx = None

    def _activate_cuda(self, *items):
        if self._cuda_num is None:
            return items
        if 1 == len(items):
            return items[0].cuda(self._cuda_num)
        return [x.cuda(self._cuda_num) for x in items]

    def _encode_onehot_gnx(self):
        ident = np.identity(len(self._labels))
        # If node labels are stored as indices, key the one-hot lookup by index;
        # otherwise key it by the label value itself.
        if self._gnx.graph.get('is_index_labels', False):
            labels_dict = {i: ident[i, :] for i, label in self._labels.items()}
        else:
            labels_dict = {label: ident[i, :] for i, label in self._labels.items()}
        return np.array([labels_dict[self._gnx.node[n]['label']]
                         for n in self._nodes_order], dtype=np.int32)

    @property
    def num_labels(self):
        return len(self._labels)

    @property
    def labels(self):
        labels = torch.LongTensor(np.where(self._ident_labels)[1])
        return self._activate_cuda(labels)

    @property
    def train_idx(self):
        train_idx = torch.LongTensor(self._train_idx)
        return self._activate_cuda(train_idx)

    @property
    def val_idx(self):
        val_idx = torch.LongTensor(self._val_idx)
        return self._activate_cuda(val_idx)

    @property
    def test_idx(self):
        test_idx = torch.LongTensor(self._test_idx)
        return self._activate_cuda(test_idx)

    @property
    def bow_mx(self):
        bow_feat = torch.FloatTensor(self._bow_mx)
        return self._activate_cuda(bow_feat)

    @property
    def topo_mx(self):
        assert self._topo_mx is not None, "Split train required"
        topo_feat = torch.FloatTensor(self._topo_mx)
        return self._activate_cuda(topo_feat)

    @property
    def adj_rt_mx(self):
        return self._activate_cuda(self._adj_rt.clone())

    @property
    def adj_mx(self):
        return self._activate_cuda(self._adj.clone())

    def split_test(self, test_p):
        indexes = range(len(self._nodes_order))
        # Only the index split is kept; the held-out node set itself is unused.
        self._train_set, _, self._base_train_idx, self._test_idx = train_test_split(
            self._nodes_order, indexes, test_size=test_p, shuffle=True)

    def _features_path(self):
        return os.path.join(self._data_path, "features%d" % (self._is_max_connected,))

    def split_train(self, train_p, features_meta=None):
        if features_meta is None:
            features_meta = self._features_meta
        train_set, val_set, self._train_idx, self._val_idx = train_test_split(self._train_set, self._base_train_idx,
                                                                              test_size=1 - train_p, shuffle=True)

        features_path = self._features_path()
        features = GraphFeatures(self._gnx, features_meta, dir_path=features_path,
                                 logger=self._logger, is_max_connected=self._is_max_connected)
        features.build(include=set(train_set), should_dump=True)

        add_ones = bool({"first_neighbor_histogram", "second_neighbor_histogram"}.intersection(features_meta))
        self._topo_mx = features.to_matrix(add_ones=add_ones, dtype=np.float64, mtype=np.matrix, should_zscore=True)

        ratio = 10 ** np.ceil(np.log10(abs(np.mean(self._topo_mx) / np.mean(self._bow_mx))))
        self._topo_mx /= ratio
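
# Usage sketch -- an assumption, not part of the original source. The intended
# call order for this variant: split_test first, then split_train, which also
# builds the topological features. `FEATURES_META` is hypothetical.
if __name__ == "__main__":
    loader = GraphLoader("data/cora", FEATURES_META, is_max_connected=True)
    loader.split_test(test_p=0.3)
    loader.split_train(train_p=0.8)
    bow, topo, adj = loader.bow_mx, loader.topo_mx, loader.adj_mx
    labels, train_idx = loader.labels, loader.train_idx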