def __init__(self,
                 data_path,
                 feature_meta,
                 test_p,
                 gnx_idx=None,
                 cuda_num=None,
                 logger=None):
        super(GraphLoader, self).__init__()
        self._logger = EmptyLogger() if logger is None else logger
        self._gnx_idx = gnx_idx
        self._cuda_num = cuda_num
        self._test_p = test_p
        self._features_meta = feature_meta
        self._data_path = data_path

        # self._logger.debug("Loading %s dataset...", self._dataset)
        # self._gnx = pickle.load(open(os.path.join(self.dataset_path, "gnx.pkl"), "rb"))
        # self._content = pickle.load(open(os.path.join(self.dataset_path, "content.pkl"), "rb"))
        # self._nodes_order = sorted(self._gnx)
        self._train_set = self._test_set = self._train_idx = self._test_idx = None
        self._inputs = self._targets = None
        self._nodes_order = []

        self._labels = {i: label for i, label in enumerate(self._get_labels())}
        self._prepare_data()
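
All of the examples on this page fall back to an EmptyLogger when no logger is supplied, but the class itself never appears here. A minimal no-op sketch, assuming it only needs to swallow the debug/info/warning/error calls the examples make, could be:

class EmptyLogger:
    # Hypothetical stand-in: every logging call is silently discarded.
    def _noop(self, *args, **kwargs):
        pass

    debug = info = warning = error = _noop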
Example #2
    def __init__(self, data_dir, features_meta, is_max_connected=False, cuda_num=None, logger=None):
        super(GraphLoader, self).__init__()
        self._logger = EmptyLogger() if logger is None else logger
        self._data_path = data_dir
        self._cuda_num = cuda_num
        self._features_meta = features_meta
        self._is_max_connected = is_max_connected

        self._logger.debug("Loading %s dataset...", self._data_path)
        features_path = self._features_path()
        gpath = os.path.realpath(os.path.join(features_path, "..", "gnx.pkl"))
        self._gnx = pickle.load(open(gpath, "rb"))

        self._nodes_order = sorted(self._gnx)
        self._labels = {i: label for i, label in enumerate(self._gnx.graph["node_labels"])}
        self._ident_labels = self._encode_onehot_gnx()

        self._content = pickle.load(open(os.path.join(self._data_path, "content.pkl"), "rb"))
        bow_mx = np.vstack([self._content[node] for node in self._nodes_order]).astype(DTYPE)
        self._bow_mx = normalize(bow_mx)
        self._topo_mx = None

        # Adjacency matrices
        adj = nx.adjacency_matrix(self._gnx, nodelist=self._nodes_order).astype(DTYPE)
        self._adj = handle_matrix_symmetric(adj)
        self._adj = sparse_mx_to_torch_sparse_tensor(self._adj).to_dense()
        self._adj_rt = handle_matrix_concat(adj, should_normalize=True)
        self._adj_rt = sparse_mx_to_torch_sparse_tensor(self._adj_rt).to_dense()

        self._train_set = self._test_set = None
        self._train_idx = self._test_idx = self._base_train_idx = None
        self._val_idx = None
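
Example #2 leans on several helpers that are not shown (normalize, handle_matrix_symmetric, handle_matrix_concat, sparse_mx_to_torch_sparse_tensor). The torch conversion is commonly written pygcn-style; a sketch under that assumption:

import numpy as np
import torch

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    # Assumed pygcn-style helper: scipy sparse matrix -> torch sparse COO tensor.
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    return torch.sparse_coo_tensor(indices, values, torch.Size(sparse_mx.shape))

The handle_matrix_* helpers presumably symmetrize or concatenate and normalize the adjacency before this conversion; they are not reproduced here.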
Example #3
 def __init__(self, conf, logger, data_logger=None, is_nni=False):
     self._logger = logger
     self._data_logger = EmptyLogger() if data_logger is None else data_logger
     self._conf = conf
     self.bar = 0.5
     self._lr = conf["lr"]
     self._is_nni = is_nni
     self._device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
     self._ce_loss = self._soft_ce_loss
     self._temporal_loss = torch.nn.MSELoss(reduction='sum').to(self._device)
Example #4

 def __init__(self, conf, logger, data_logger=None, is_nni=False):
     self._logger = logger
     self._data_logger = EmptyLogger() if data_logger is None else data_logger
     self._conf = conf
     self.bar = 0.5
     self._lr = conf["lr"]
     self._is_nni = is_nni
     self._device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
     self._loss = BCELoss()
Example #5
    def __init__(self, data_path, cuda, logger, data_logger=None):
        self._logger = logger
        self._cuda = cuda
        self._data_logger = EmptyLogger() if data_logger is None else data_logger
        self._data_path = data_path

        # feature_meta = NEIGHBOR_FEATURES
        feature_meta = NODE_FEATURES
        # feature_meta = NODE_FEATURES.copy()
        # feature_meta.update(NEIGHBOR_FEATURES)
        self.loader = GraphLoader(feature_meta,
                                  cuda_num=cuda,
                                  logger=self._logger)
Example #6
    def __init__(self, dataset_path, conf, logger, data_logger=None):
        self._logger = logger
        self._data_logger = EmptyLogger() if data_logger is None else data_logger
        self._criterion = torch.nn.NLLLoss()
        self._conf = conf

        features_meta = get_features()
        self.loader = GraphLoader(
            dataset_path,
            features_meta,
            is_max_connected=False,  # self._conf['dataset'] == "citeseer",
            cuda_num=conf["cuda"],
            logger=self._logger)
Example #7
class SpecificFeatureTest(unittest.TestCase):
    logger = EmptyLogger()

    @classmethod
    def setUpClass(cls):
        cls._test_data = TestData(logger=cls.logger)

    def _test_feature(self,
                      feature_cls,
                      is_directed,
                      is_max_connected=True,
                      manual=None,
                      **cmp_features):
        gnx = get_di_graph() if is_directed else get_graph()
        gnx = filter_gnx(gnx, is_max_connected)
        feature = feature_cls(gnx, logger=self.logger)
        res = feature.build()
        # mx_res = feature.to_matrix()
        if manual is None:
            prev_res = self._test_data.load_feature(feature_cls, is_directed)
        else:
            prev_res = manual
        if prev_res is not None or feature.is_relevant():
            if not are_results_equal(res, prev_res, **cmp_features):
                are_results_equal(res, prev_res, **cmp_features)
            self.assertTrue(are_results_equal(res, prev_res, **cmp_features))
Example #8
 def attachDetachLogger(self, cpu):
     if type(cpu.logger) is not EmptyLogger:
         print("Detaching logger")
         cpu.logger = EmptyLogger()
     else:
         print("Attaching logger")
         cpu.logger = Logger(cpu)
Example #9

    def __init__(self,
                 gnx,
                 features,
                 dir_path,
                 logger=None,
                 is_max_connected=False):
        self._base_dir = dir_path
        self._logger = EmptyLogger() if logger is None else logger
        self._matrix = None

        if is_max_connected:
            # NOTE: the *_connected_component_subgraphs helpers were removed in networkx 2.4;
            # on newer versions use gnx.subgraph(max(nx.weakly_connected_components(gnx), key=len)).
            if gnx.is_directed():
                subgraphs = nx.weakly_connected_component_subgraphs(gnx)
            else:
                subgraphs = nx.connected_component_subgraphs(gnx)
            self._gnx = max(subgraphs, key=len)
        else:
            self._gnx = gnx

        self._abbreviations = {
            abbr: name
            for name, meta in features.items() for abbr in meta.abbr_set
        }

        # building the feature calculators data structure
        super(GraphFeatures, self).__init__({
            name: meta.calculator(self._gnx, logger=logger)
            for name, meta in features.items()
        })
Example #10
    def __init__(self, path_info, is_max_connected, *args, logger=None, cuda_num=None, is_debug=False, **kwargs):
        super(MultiGraphLoader, self).__init__()
        # def __init__(self, path_info, is_max_connected, norm_adj=True, cuda_num=None, logger=None):
        # path_info = {"years": os.path.realpath(os.path.join(PROJ_DIR, "..", "data", "firms", "years")),
        #              "label": "top"}
        self._path_info = path_info
        self._path_info["split"] = os.path.realpath(os.path.join(self._path_info["years"], "..", "split.pkl"))

        if logger is None:
            logger = EmptyLogger()
        self._logger = logger

        # TODO: implement dynamic loading of the data
        for year in sorted(os.listdir(path_info["years"]), key=int):
            data_path = os.path.realpath(os.path.join(path_info["years"], year))
            year_paths = {
                "features": os.path.join(data_path, "features%d" % (is_max_connected,)),
                "content": os.path.join(data_path, "content_clean.pkl"),
                # "content": os.path.join(data_path, path_info["label"], "content.pkl"),
                "gnx": os.path.join(data_path, path_info["label"], "gnx.pkl"),
                "name": str(year),
            }
            self[int(year)] = GraphLoader(year_paths, is_max_connected, *args, logger=self._logger, cuda_num=cuda_num, **kwargs)
            if is_debug and (1997 == int(year)): break

        self._nodes = np.array(self.ordered_nodes)  # getattr will take the first one
        self._test_idx = self._base_train_idx = None
        self._train_idx = self._val_idx = None
        self._should_split = not os.path.exists(self._path_info["split"])
        if not self._should_split:
            self._load_split()
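
Since MultiGraphLoader registers each snapshot under self[int(year)], one year's GraphLoader is a single dict lookup. A hypothetical usage sketch (the paths are placeholders):

path_info = {"years": "/data/firms/years", "label": "top"}  # placeholder paths
loaders = MultiGraphLoader(path_info, is_max_connected=False)
loader_1997 = loaders[1997]  # the GraphLoader built for that year's snapshot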
Example #11
 def __init__(self, gnx, logger=None):
     # super(FeatureCalculator, self).__init__()
     self._is_loaded = False
     self._features = {}
     self._logger = EmptyLogger() if logger is None else logger
     self._gnx = gnx
     self._print_name = self.print_name()
Example #12
    def test_attachDetachLogger_attaches_logger_when_there_is_Empty(self):
        debugger = Debugger()
        cpu = FakeCpu()
        cpu.logger = EmptyLogger()
        debugger.attachDetachLogger(cpu)

        self.assertFalse(type(cpu.logger) is EmptyLogger)
Example #13
    def __init__(self, *args, logger=None, **kwargs):
        super(BaseReactor, self).__init__()
        self._should_run = True
        self._logger = logger or EmptyLogger()

        self._selector = SelectSelector()
        self.register = self._selector.register
        self._logger.info("Initiating %s" % (type(self).__name__, ))
Example #14

 def __init__(self, conf, GS, logger, data_logger=None, is_nni=False):
     self._logger = logger
     self._data_logger = EmptyLogger() if data_logger is None else data_logger
     self._conf = conf
     self.bar = 0.5
     self._lr = conf["lr"]
     self._is_nni = is_nni
     # choosing GPU device
     self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     if self._device != "cpu":
         with torch.cuda.device("cuda:{}".format(CUDA_Device)):
             torch.cuda.empty_cache()
         if not self._is_nni:
             self._device = torch.device("cuda:{}".format(CUDA_Device))
     self._loss = self.graphSaintLoss if GS else self.regular_loss
     self.accuracy = self.accuracy_GraphSaint if GS else self.accuracy_regular
     self._ce_loss = torch.nn.CrossEntropyLoss(reduction="mean").to(self._device)
     self._ce_loss2 = torch.nn.BCELoss(reduction='mean')
Example #15
    def __init__(self, queue, calculators, include, logger=None):
        super(Worker, self).__init__()
        if logger is None:
            logger = EmptyLogger()

        self._queue = queue
        self._calculators = calculators
        self._logger = logger
        self._include = include
Example #16
    def __init__(self,
                 params,
                 loader,
                 cuda_device,
                 data_logger=None,
                 epochs_logger=None):

        self._params = params

        self.prev_training_inds = None
        self.prev_val_inds = None
        self.prev_test_inds = None

        self._epoch_logger = EmptyLogger() if epochs_logger is None else epochs_logger
        self._data_logger = EmptyLogger() if data_logger is None else data_logger

        self._device = torch.device(f'cuda:{cuda_device}') if torch.cuda.is_available() else torch.device('cpu')

        self._mse_loss = self.weighted_mse_loss
        self._temporal_loss = self.weighted_mse_loss

        self.net = None
        self.opt = None
        self.loader = loader
        self.num_features = loader.dataset[0].num_features  # loader[0].num_features

        # if SSP
        self.preconditioner = None
        self.eps = params['eps']
        self.update_freq = params['update_freq']
        self.gamma = params['gamma']
        self.alpha = params['alpha']
        self.lamda = params['lamda']

        self.best_loss = None
        # self.best_model = None
        self.best_epoch = None

        self.is_nni = params["is_nni"]
Example #17
 def __init__(self,
              conf,
              logger,
              weights,
              graph_params,
              data_logger=None,
              is_nni=False):
     self._logger = logger
     self._data_logger = EmptyLogger() if data_logger is None else data_logger
     self._conf = conf
     self._weights_dict = weights
     self._clique_size = graph_params['clique_size']
     self._graph_params = graph_params
     self.bar = 0.5
     self._lr = conf["lr"]
     self._is_nni = is_nni
     self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
Example #18
    def __init__(self,
                 products_path,
                 dataset_path,
                 conf,
                 logger,
                 data_logger=None):
        self.conf = conf
        self._logger = logger
        self._data_logger = EmptyLogger() if data_logger is None else data_logger
        self.products_path = products_path

        self.loader = GraphLoader(dataset_path,
                                  is_max_connected=False,
                                  norm_adj=conf["norm_adj"],
                                  cuda_num=conf["cuda"],
                                  logger=self._logger)

        self._criterion = torch.nn.NLLLoss()
Example #19
    def __init__(self,
                 gnx,
                 features,
                 dir_path,
                 logger=None,
                 is_max_connected=False):
        self._base_dir = dir_path
        self._logger = EmptyLogger() if logger is None else logger
        self._matrix = None

        self._gnx = get_max_subgraph(gnx) if is_max_connected else gnx

        self._abbreviations = {
            abbr: name
            for name, meta in features.items() for abbr in meta.abbr_set
        }

        # building the feature calculators data structure
        super(GraphFeatures, self).__init__({
            name: meta.calculator(self._gnx, logger=logger)
            for name, meta in features.items()
        })
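
This example (like #20 below) calls get_max_subgraph, which is not shown on this page. The inline logic in example #8 suggests it keeps the largest (weakly) connected component; a sketch consistent with that, written for networkx >= 2.4 where the *_component_subgraphs helpers no longer exist:

import networkx as nx

def get_max_subgraph(gnx):
    # Assumed behavior: keep the largest (weakly) connected component.
    if gnx.is_directed():
        components = nx.weakly_connected_components(gnx)
    else:
        components = nx.connected_components(gnx)
    return gnx.subgraph(max(components, key=len)).copy()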
Example #20
    def __init__(self, paths, is_max_connected, ignore_index=-1, norm_adj=True, logger=None, cuda_num=None, dtype=torch.double):
        super(GraphLoader, self).__init__()
        self._logger = EmptyLogger() if logger is None else logger
        self._paths = paths
        self._ignore_index = ignore_index
        self._cuda_num = cuda_num
        self._dtype = dtype

        self._logger.debug("Loading %s dataset...", paths["features"])
        self._gnx = pickle.load(open(paths["gnx"], "rb"))
        self._is_max_connected = is_max_connected
        if is_max_connected:
            self._gnx = get_max_subgraph(self._gnx)

        self.ordered_nodes = sorted(self._gnx)
        self._labeled_nodes = set(i for i, n in enumerate(self.ordered_nodes) if "label" in self._gnx.node[n])
        # self._labeled_nodes = [(i, n) for i, n in enumerate(self.ordered_nodes) if "label" in self._gnx.node[n]]
        self._labels = {i: label for i, label in enumerate(self._gnx.graph["node_labels"])}
        self._node_labels = self._get_node_labels()

        self._content = OrderedDict(sorted(pickle.load(open(paths["content"], "rb")).items(), key=lambda x: x[0]))
        bow_mx = np.vstack(self._content.values()).astype(DTYPE)
        median_bow = np.median(bow_mx, axis=0)
        bow_mx = np.vstack([self._content.get(node, median_bow) for node in self.ordered_nodes]).astype(DTYPE)

        self._bow_mx = z_scoring(bow_mx)
        self._topo_mx = None

        # Adjacency matrices
        adj = nx.adjacency_matrix(self._gnx, nodelist=self.ordered_nodes).astype(DTYPE)
        self._adj = handle_matrix_symmetric(adj, should_normalize=norm_adj)
        self._adj = sparse_mx_to_torch_sparse_tensor(self._adj).to_dense()
        self._adj_rt = handle_matrix_concat(adj, should_normalize=norm_adj)
        self._adj_rt = sparse_mx_to_torch_sparse_tensor(self._adj_rt).to_dense()

        self._train_set = self._test_set = None
        self._train_idx = self._test_idx = self._base_train_idx = None
        self._val_idx = None
Example #21
    def __init__(self, logger=None):
        if logger is None:
            logger = EmptyLogger()
        self._logger = logger
        self._data_dir = os.path.dirname(os.path.realpath(__file__))
        df1 = pd.read_csv(os.path.join(self._data_dir, "test_undirected"))
        self._ugnx = nx.from_pandas_edgelist(df1,
                                             "n1",
                                             "n2", ["weight"],
                                             create_using=nx.Graph())

        df2 = pd.read_csv(os.path.join(self._data_dir, "test_directed"))
        self._gnx = nx.from_pandas_edgelist(df2,
                                            "n1",
                                            "n2", ["weight"],
                                            create_using=nx.DiGraph())
Example #22
 def __init__(self,
              reader=None,
              crucial=True,
              state_notifier=None,
              reactor=None,
              logger=None):
     super(Channel, self).__init__()
     self._state_notifier = state_notifier  # handler = handler or ChannelHandler(self)
     self._reactor = reactor or AsyncReactor()
     self._logger = logger or EmptyLogger()
     self._state = ChannelState.IDLE
     self._crucial = crucial
     if reader is not None:
         reader.set_channel(self)
     else:
         reader = DefaultReader(1, self)
     self._reader = reader
     self._buffered = ""
Example #23
 def __init__(self, params, logger, data_logger=None, epochs_logger=None):
     self._logger = logger
     self._epoch_logger = epochs_logger
     self._data_logger = EmptyLogger() if data_logger is None else data_logger
     self._parameters = params
     self._lr = params["lr"]
     self._is_nni = params['is_nni']
     self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
     self._mse_loss = self.weighted_mse_loss
     self._temporal_loss = self.weighted_mse_loss
     self.model = GCN(num_of_features=self._parameters["feature_matrices"][0].shape[1],
                      hid_size=self._parameters["hid_size"],
                      num_of_classes=self._parameters["number_of_classes"],
                      activation=self._parameters["activation"],
                      dropout=self._parameters["dropout"])
     self.model = self.model.to(self._device)
     self.opt = self._parameters["optimizer"](self.model.parameters(),
                                              lr=self._parameters['lr'],
                                              weight_decay=self._parameters['weight_decay'])
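
For reference, a hypothetical params dict covering the keys this runner reads (all values are illustrative, not taken from the source):

import torch

params = {
    "lr": 0.01,
    "is_nni": False,
    "feature_matrices": [torch.zeros(100, 16)],  # shape[1] fixes num_of_features
    "hid_size": 32,
    "number_of_classes": 2,
    "activation": torch.relu,
    "dropout": 0.5,
    "optimizer": torch.optim.Adam,
    "weight_decay": 5e-4,
}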
Example #24
    def __init__(self,
                 paths,
                 fast_mode,
                 norm_adj,
                 cuda_dev,
                 is_max,
                 logger,
                 data_logger=None,
                 is_debug=False,
                 dtype=torch.double):
        # plt.rcParams.update({'figure.max_open_warning': 0})
        self._logger = logger
        self._data_logger = EmptyLogger() if data_logger is None else data_logger
        self._fast_mode = fast_mode
        self.products_path = paths["products"]
        self._paths = paths
        self._cuda_dev = cuda_dev
        self._dtype = dtype

        self.loaders = MultiGraphLoader(paths,
                                        is_max,
                                        norm_adj=norm_adj,
                                        logger=self._logger,
                                        ignore_index=-1,
                                        cuda_num=cuda_dev,
                                        is_debug=is_debug,
                                        dtype=dtype)

        criterion_weight = torch.FloatTensor([1 / 34, 1 / 740])
        if cuda_dev is not None:
            criterion_weight = criterion_weight.cuda(cuda_dev)
        self._criterion_weight = criterion_weight
        self._criterion = torch.nn.NLLLoss(weight=self._criterion_weight,
                                           ignore_index=-1)
        # self._criterion = torch.nn.CrossEntropyLoss(weight=criterion_weight, ignore_index=-1)
        self._criterion = self._criterion.type(self._dtype).cuda(
            self._cuda_dev)
        self._run_label = ""
        self._reset_saved_models()
Example #25
class GraphLoader(object):
    def __init__(self, paths, is_max_connected, ignore_index=-1, norm_adj=True, logger=None, cuda_num=None, dtype=torch.double):
        super(GraphLoader, self).__init__()
        self._logger = EmptyLogger() if logger is None else logger
        self._paths = paths
        self._ignore_index = ignore_index
        self._cuda_num = cuda_num
        self._dtype = dtype

        self._logger.debug("Loading %s dataset...", paths["features"])
        self._gnx = pickle.load(open(paths["gnx"], "rb"))
        self._is_max_connected = is_max_connected
        if is_max_connected:
            self._gnx = get_max_subgraph(self._gnx)

        self.ordered_nodes = sorted(self._gnx)
        self._labeled_nodes = set(i for i, n in enumerate(self.ordered_nodes) if "label" in self._gnx.node[n])
        # self._labeled_nodes = [(i, n) for i, n in enumerate(self.ordered_nodes) if "label" in self._gnx.node[n]]
        self._labels = {i: label for i, label in enumerate(self._gnx.graph["node_labels"])}
        self._node_labels = self._get_node_labels()

        self._content = OrderedDict(sorted(pickle.load(open(paths["content"], "rb")).items(), key=lambda x: x[0]))
        bow_mx = np.vstack(self._content.values()).astype(DTYPE)
        median_bow = np.median(bow_mx, axis=0)
        bow_mx = np.vstack([self._content.get(node, median_bow) for node in self.ordered_nodes]).astype(DTYPE)

        self._bow_mx = z_scoring(bow_mx)
        self._topo_mx = None

        # Adjacency matrices
        adj = nx.adjacency_matrix(self._gnx, nodelist=self.ordered_nodes).astype(DTYPE)
        self._adj = handle_matrix_symmetric(adj, should_normalize=norm_adj)
        self._adj = sparse_mx_to_torch_sparse_tensor(self._adj).to_dense()
        self._adj_rt = handle_matrix_concat(adj, should_normalize=norm_adj)
        self._adj_rt = sparse_mx_to_torch_sparse_tensor(self._adj_rt).to_dense()

        self._train_set = self._test_set = None
        self._train_idx = self._test_idx = self._base_train_idx = None
        self._val_idx = None

    @property
    def name(self):
        return str(self._paths["name"])

    @property
    def is_graph_directed(self):
        return self._gnx.is_directed()

    # def _activate_cuda(self, items):
        # return items

    # def _encode_onehot_gnx(self):  # gnx, nodes_order: list = None):
    #     labels = self._labels.copy()
    #     if labels[len(labels) - 1] is not None:
    #         labels[len(labels)] = None
    #     ident = np.identity(len(labels))
    #     if self._gnx.graph.get('is_index_labels', False):
    #         labels_dict = {i: ident[i, :] for i, label in labels.items()}
    #     else:
    #         labels_dict = {label: ident[i, :] for i, label in labels.items()}
    #     return np.array(list(map(lambda n: labels_dict[self._gnx.node[n].get('label')], self._nodes_order)),
    #                     dtype=np.int32)

    def _get_node_labels(self):
        labels = self._labels.copy()
        labels[self._ignore_index] = None
        labels_dict = {label: i for i, label in labels.items()}
        return np.array(list(map(lambda n: labels_dict[self._gnx.node[n].get('label')], self.ordered_nodes)),
                        dtype=np.int32)

    def set_variables(self, **kwargs):
        for key, val in kwargs.items():
            self.__setattr__(key, val)

    @property
    def num_labels(self):
        return len(self._labels)

    @property
    def labels(self):
        labels = torch.LongTensor(self._node_labels)
        return activate_cuda(labels, cuda_num=self._cuda_num)

    @property
    def distinct_labels(self):
        return sorted(self._labels.keys())

    # def _get_idx(self, idx_name):
    #     return torch.LongTensor([x for x in getattr(self, idx_name) if x in set(self._labeled_nodes)])
    #
    # @property
    # def train_idx(self):
    #     return activate_cuda(self._get_idx("_train_idx"), cuda_num=self._cuda_num)
    #
    # @property
    # def val_idx(self):
    #     return activate_cuda(self._get_idx("_val_idx"), cuda_num=self._cuda_num)
    #
    # @property
    # def test_idx(self):
    #     return activate_cuda(self._get_idx("_test_idx"), cuda_num=self._cuda_num)

    @property
    def bow_mx(self):
        # bow_feat = torch.FloatTensor(self._bow_mx)
        bow_feat = torch.DoubleTensor(self._bow_mx)
        return activate_cuda(bow_feat, cuda_num=self._cuda_num)

    @property
    def topo_mx(self):
        assert self._topo_mx is not None, "Split train required"
        # topo_feat = torch.FloatTensor(self._topo_mx)
        topo_feat = torch.DoubleTensor(self._topo_mx)
        return activate_cuda(topo_feat, cuda_num=self._cuda_num)

    @property
    def adj_rt_mx(self):
        return activate_cuda(self._adj_rt, cuda_num=self._cuda_num)  # .clone())

    @property
    def adj_mx(self):
        return activate_cuda(self._adj, cuda_num=self._cuda_num).type(self._dtype)  # .clone())

        # def split_test(self, test_p):
        #     indexes, nodes = zip(*self._labeled_nodes)
        #     self._train_set, _, self._base_train_idx, self._test_idx = train_test_split(nodes, indexes, test_size=test_p)

        # def split_train(self, train_p, features_meta):
        #     train_set, val_set, self._train_idx, self._val_idx = train_test_split(self._train_set, self._base_train_idx,
        #                                                                           test_size=1 - train_p)
        #     feat_path = os.path.join(self._feat_path, "features%d" % (self._is_max_connected,))
        # features = GraphFeatures(self._gnx, features_meta, dir_path=self._paths["features"], logger=self._logger,
        #                          is_max_connected=False)  # Already taking the max sub_graph in init
        # features.build(include=set(train_set), should_dump=False)
        #
        # add_ones = bool({"first_neighbor_histogram", "second_neighbor_histogram"}.intersection(features_meta))
        # self._topo_mx = features.to_matrix(add_ones=add_ones, dtype=np.float64, mtype=np.matrix, should_zscore=True)
        #
        # ratio = 10 ** np.ceil(np.log10(abs(np.mean(self._topo_mx) / np.mean(self._bow_mx))))
        # self._topo_mx /= ratio

    def set_train(self, train_set, features_meta):
        features = GraphFeatures(self._gnx, features_meta, dir_path=self._paths["features"], logger=self._logger,
                                 is_max_connected=False)  # Already taking the max sub_graph in init
        features.build(include=set(train_set), should_dump=True)

        add_ones = bool({"first_neighbor_histogram", "second_neighbor_histogram"}.intersection(features_meta))
        self._topo_mx = features.to_matrix(add_ones=add_ones, dtype=np.float64, mtype=np.matrix, should_zscore=True)

        ratio = 10 ** np.ceil(np.log10(abs(np.mean(self._topo_mx) / np.mean(self._bow_mx))))
        self._topo_mx /= ratio
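
The tensor properties above all pass through activate_cuda, which is not defined on this page. From its call sites it only moves a tensor to the requested GPU when cuda_num is set; a minimal sketch under that assumption:

def activate_cuda(tensor, cuda_num=None):
    # Assumed helper: move to the given CUDA device, otherwise leave on CPU.
    if cuda_num is None:
        return tensor
    return tensor.cuda(cuda_num)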
Example #26
class GraphLoader(object):
    def __init__(self,
                 data_path,
                 feature_meta,
                 test_p,
                 gnx_idx=None,
                 cuda_num=None,
                 logger=None):
        super(GraphLoader, self).__init__()
        self._logger = EmptyLogger() if logger is None else logger
        self._gnx_idx = gnx_idx
        self._cuda_num = cuda_num
        self._test_p = test_p
        self._features_meta = feature_meta
        self._data_path = data_path

        # self._logger.debug("Loading %s dataset...", self._dataset)
        # self._gnx = pickle.load(open(os.path.join(self.dataset_path, "gnx.pkl"), "rb"))
        # self._content = pickle.load(open(os.path.join(self.dataset_path, "content.pkl"), "rb"))
        # self._nodes_order = sorted(self._gnx)
        self._train_set = self._test_set = self._train_idx = self._test_idx = None
        self._inputs = self._targets = None
        self._nodes_order = []

        self._labels = {i: label for i, label in enumerate(self._get_labels())}
        self._prepare_data()

    def _get_labels(self):
        gnx = pickle.load(
            open(os.path.join(next(self._get_gnx_paths()), "gnx.pkl"), "rb"))
        return gnx.graph["node_labels"]

    @staticmethod
    def _encode_onehot_gnx1(gnx, nodes_order):
        labels = gnx.graph["node_labels"]
        labels_dict = {
            c: np.identity(len(labels))[i, :]
            for i, c in enumerate(labels)
        }
        labels_dict.update({i: labels_dict[c] for i, c in enumerate(labels)})
        return np.array(list(
            map(lambda n: labels_dict[gnx.node[n]['label']], nodes_order)),
                        dtype=np.int32)

    def _encode_onehot_gnx(self, gnx, nodes_order):
        # labels = gnx.graph["node_labels"]
        ident = np.identity(len(self._labels))
        labels_dict = {label: ident[j, :] for j, label in self._labels.items()}
        # labels_dict = {c: np.identity(len(self._labels))[i, :] for i, c in enumerate(labels)}
        # labels_dict.update({i: labels_dict[c] for i, c in enumerate(labels)})
        return np.array(list(
            map(lambda n: labels_dict[gnx.node[n]['label']], nodes_order)),
                        dtype=np.int32)

    def _join_graphs1(self):
        all_nodes = set()
        common_nodes = None
        for path in self._get_gnx_paths():
            gnx = pickle.load(open(os.path.join(path, "orig_gnx.pkl"), "rb"))
            all_nodes = all_nodes.union(gnx)
            if common_nodes is None:
                common_nodes = set(gnx)
            else:
                common_nodes = common_nodes.intersection(gnx)

        pickle.dump(
            all_nodes,
            open(os.path.join(path, "..", "..", "all_nodes.pkl"), "wb"))
        pickle.dump(
            common_nodes,
            open(os.path.join(path, "..", "..", "common_nodes.pkl"), "wb"))
        return all_nodes, common_nodes

    def _join_graphs(self):
        path = next(self._get_gnx_paths())
        all_nodes = pickle.load(
            open(os.path.join(path, "..", "..", "all_nodes.pkl"), "rb"))
        common_nodes = pickle.load(
            open(os.path.join(path, "..", "..", "common_nodes.pkl"), "rb"))
        return all_nodes, common_nodes

    def _split_data(self):
        feat_path = os.path.join(next(self._get_gnx_paths()), "features_0")
        gnx = pickle.load(open(os.path.join(feat_path, "gnx.pkl"), "rb"))
        self._nodes_order = sorted(
            [node for node in gnx if gnx.node[node]['label'] is not None])
        indexes = [(i, node) for i, node in enumerate(self._nodes_order)]

        idx, nodes = zip(*indexes)
        c_train, c_test, c_train_idx, c_test_idx = train_test_split(
            nodes, idx, test_size=self._test_p, shuffle=True)

        self._train_set = set(c_train)
        self._test_set = set(c_test)
        self._test_idx = np.array(c_test_idx)
        self._train_idx = np.array(c_train_idx)

    def _split_data_orig(self):
        all_nodes, common_nodes = self._join_graphs()

        self._nodes_order = sorted(all_nodes)
        indexes = [(i, node) for i, node in enumerate(self._nodes_order)]
        common, uncommon = [], []
        for i, node in indexes:
            cur_list = common if node in common_nodes else uncommon
            cur_list.append((i, node))
        c_idx, c_nodes = zip(*common)
        c_train, c_test, c_train_idx, c_test_idx = train_test_split(
            c_nodes, c_idx, test_size=self._test_p, shuffle=True)
        uc_idx, uc_nodes = zip(*uncommon)
        uc_train, uc_test, uc_train_idx, uc_test_idx = train_test_split(
            uc_nodes, uc_idx, test_size=self._test_p, shuffle=True)

        self._train_set = set(c_train).union(uc_train)
        self._test_set = set(c_test).union(uc_test)
        self._test_idx = np.array(c_test_idx + uc_test_idx)
        self._train_idx = np.array(c_train_idx + uc_train_idx)

    def _activate_cuda(self, *args):
        if self._cuda_num is None:
            return args
        return [x.cuda(self._cuda_num) for x in args]

    # firms/years/features_0-1
    def _get_gnx_paths(self):
        paths = sorted(os.listdir(self._data_path), key=int)
        if self._gnx_idx is not None:
            # for x in [4, 6]:
            #     yield os.path.join(self._data_path, paths[x])
            yield os.path.join(self._data_path, paths[self._gnx_idx])
            return
        for path in paths:
            yield os.path.join(self._data_path, path)

    def _prepare_data1(self):
        self._split_data()

        self._inputs = self._targets = None
        for path in self._get_gnx_paths():
            feat_path = os.path.join(path, "features_0")
            cur_data = pickle.load(
                open(os.path.join(feat_path, "data.pkl"), "rb"))
            self._inputs = cur_data if self._inputs is None else np.dstack(
                (self._inputs, cur_data))
            cur_labels = pickle.load(
                open(os.path.join(feat_path, "labels.pkl"), "rb"))
            self._targets = cur_labels if self._targets is None else np.dstack(
                (self._targets, cur_labels))
        self._inputs = self._inputs.transpose((0, 2, 1))
        self._targets = self._targets.transpose((0, 2, 1))
        self._logger.debug("Finished preparing the data")

    def _prepare_data(self):
        self._split_data()

        self._inputs = self._targets = None
        for path in self._get_gnx_paths():
            feat_path = os.path.join(path, "features_0")
            gnx = pickle.load(open(os.path.join(feat_path, "gnx.pkl"), "rb"))
            gnx = gnx.subgraph(self._nodes_order)

            features = GraphFeatures(gnx,
                                     self._features_meta,
                                     dir_path=feat_path,
                                     logger=self._logger)
            features.build(include=self._train_set)

            add_ones = bool(
                set(self._features_meta).intersection(
                    ["first_neighbor_histogram", "second_neighbor_histogram"]))
            cur_data = features.to_matrix(add_ones=add_ones,
                                          dtype=np.float32,
                                          mtype=np.array,
                                          should_zscore=True)
            self._inputs = cur_data if self._inputs is None else np.dstack(
                (self._inputs, cur_data))
            pickle.dump(cur_data,
                        open(os.path.join(feat_path, "data.pkl"), "wb"))

            cur_labels = self._encode_onehot_gnx(gnx, self._nodes_order)
            self._targets = cur_labels if self._targets is None else np.dstack(
                (self._targets, cur_labels))
            pickle.dump(cur_labels,
                        open(os.path.join(feat_path, "labels.pkl"), "wb"))

        # Arranging data as <batch, seq, feature>
        if self._gnx_idx is None:
            self._inputs = self._inputs.transpose((0, 2, 1))
            self._targets = self._targets.transpose((0, 2, 1))
        self._logger.debug("Finished preparing the data")
        # topo_x = torch.FloatTensor(topo_x)  # np.array(features.todense()))
        #
        # labels = torch.LongTensor(np.where(labels)[1])
        #
        # train_idx = torch.LongTensor(self._train_idx)
        # test_idx = torch.LongTensor(self._test_idx)
        #
        # topo_x, labels = convert_to_variable(topo_x, labels)
        # return self.activate_cuda([topo_x, labels])

    def _load_data(self, indexes, nbatch):
        # for inputs, targets in zip(self._inputs, self._targets):
        inputs, targets = self._inputs[indexes], self._targets[indexes]
        # for i in range(0, int(len(inputs) / nbatch) * nbatch, nbatch):
        for i in range(0, len(inputs), nbatch):
            data, labels = inputs[i:i + nbatch], targets[i:i + nbatch]
            # if self._gnx_idx is not None:
            #     data, labels = data[:, self._gnx_idx, :], labels[:, self._gnx_idx, :]
            data, labels = Variable(torch.FloatTensor(data)), Variable(
                torch.LongTensor(np.where(labels)[self.feat_dim]))
            # labels = labels[labels != reverse_labels[None]]
            yield self._activate_cuda(data, labels)

    def load_train_data(self, nbatch):
        return self._load_data(self._train_idx, nbatch)

    def load_test_data(self, nbatch):
        return self._load_data(self._test_idx, nbatch)

    @property
    def feat_dim(self):
        return 2 if self._gnx_idx is None else 1

    @property
    def num_nodes(self):
        return self._inputs.shape[0]

    @property
    def sequence_len(self):
        return self._inputs.shape[1]

    @property
    def num_features(self):
        return self._inputs.shape[self.feat_dim]

    @property
    def num_labels(self):
        return len(self._labels)

    @property
    def labels(self):
        return self._labels

    @property
    def num_layers(self):
        return [100, 20]
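
This loader serves mini-batches through generators, so a training loop is plain iteration. A hypothetical usage sketch (path, feature set, and batch size are placeholders):

loader = GraphLoader("/data/firms/years", feature_meta={}, test_p=0.3)
for data, labels in loader.load_train_data(nbatch=32):
    ...  # data is a FloatTensor batch, labels a LongTensor of class indices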
Example #27

class ModelRunner:
    def __init__(self, conf, logger, data_logger=None, is_nni=False):
        self._logger = logger
        self._data_logger = EmptyLogger() if data_logger is None else data_logger
        self._conf = conf
        self.bar = 0.5
        self._lr = conf["lr"]
        self._is_nni = is_nni
        self._device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
        self._loss = BCELoss()

    @property
    def logger(self):
        return self._logger

    @property
    def data_logger(self):
        return self._data_logger

    def my_loss(self, output, target, weights=None):
        output = torch.clamp(output, min=1e-8, max=1 - 1e-8)

        if weights is not None:
            assert len(weights) == 2
            loss = weights[1] * (target * torch.log(output)) + \
                   weights[0] * ((1 - target) * torch.log(1 - output))
        else:
            loss = target * torch.log(output) + (
                1 - target) * torch.log(1 - output)
        ret = torch.neg(torch.mean(loss))
        return ret

    def accuracy(self, output, labels):
        # output = torch.sigmoid(output)  # TODO: use it only with BCEWithLogits
        # NOTE: this measures recall on the positive class (the fraction of
        # true 1-labels whose prediction is >= 0.5), not overall accuracy.
        idxs_1_labeled = torch.where(labels == 1)
        answers = output[idxs_1_labeled]
        true_pos = torch.where(answers >= 0.5)  # tuple (,)
        return len(true_pos[0]) / len(idxs_1_labeled[0])

    def _get_model(self):
        model = Graphs_Rec(in_features=self._conf["train_data"][0].shape[0],
                           hid_features=self._conf["hid_features"],
                           out_features=1,
                           activation=self._conf["activation"],
                           dropout=self._conf["dropout"])
        opt = self._conf["optimizer"](model.parameters(),
                                      lr=self._conf["lr"],
                                      weight_decay=self._conf["weight_decay"])
        ## changed: added "feature_matrices"
        return {
            "model": model,
            "optimizer": opt,
            "train_data": self._conf["train_data"],
            "training_labels": self._conf["training_labels"],
            "test_data": self._conf["test_data"],
            "test_labels": self._conf["test_labels"]
        }

    # verbose = 0 - silent
    # verbose = 1 - print test results
    # verbose = 2 - print train for each epoch and test results
    def run(self, verbose=2):
        if self._is_nni:
            verbose = 0
        model = self._get_model()
        ##
        loss_train, acc_train, intermediate_acc_test, losses_train, accs_train, test_results = self.train(
            self._conf["epochs"], model=model, verbose=verbose)
        ##
        # Testing
        result = self.test(model=model,
                           verbose=verbose if not self._is_nni else 0,
                           print_to_file=False)
        if self._is_nni:
            self._logger.debug('Final loss train: %3.4f' % loss_train)
            self._logger.debug('Final accuracy train: %3.4f' % acc_train)
            final_results = result["acc"]
            self._logger.debug('Final accuracy test: %3.4f' % final_results)
            # _nni.report_final_result(test_auc)

        if verbose != 0:
            names = ""
            vals = ()
            for name, val in result.items():
                names = names + name + ": %3.4f  "
                vals = vals + tuple([val])
                self._data_logger.info(name, val)
        parameters = {
            "lr": self._conf["lr"],
            "weight_decay": self._conf["weight_decay"],
            "dropout": self._conf["dropout"],
            "optimizer": self._conf["optim_name"]
        }
        return loss_train, acc_train, intermediate_acc_test, result, losses_train, accs_train, test_results, parameters

    def train(self, epochs, model=None, verbose=2):
        loss_train = 0.
        acc_train = 0.
        losses_train = []
        accs_train = []
        test_results = []
        intermediate_test_acc = []
        for epoch in range(epochs):
            loss_train, acc_train = self._train(epoch, model, verbose)
            ##
            losses_train.append(loss_train)
            accs_train.append(acc_train)
            ##
            # /----------------------  FOR NNI  -------------------------
            if epoch % 5 == 0:
                test_res = self.test(
                    model, verbose=verbose if not self._is_nni else 0)
                test_results.append(test_res)
                if self._is_nni:
                    test_acc = test_res["acc"]
                    intermediate_test_acc.append(test_acc)

        return loss_train, acc_train, intermediate_test_acc, losses_train, \
               accs_train, test_results

    def _train(self, epoch, model, verbose=2):
        #self._loss = self._loss = BCEWithLogitsLoss(torch.ones([223653]).to(self._device))

        model_ = model["model"]
        model_ = model_.to(self._device)
        optimizer = model["optimizer"]

        labels = torch.DoubleTensor(model["training_labels"]).to(
            dtype=torch.float, device=self._device)
        train = torch.from_numpy(model["train_data"]).to(dtype=torch.float,
                                                         device=self._device)
        model_.train()
        optimizer.zero_grad()
        self._loss = self.my_loss

        # feed the model
        output = model_(train)

        loss_train = 0.
        labeled_1_num = len([b for b, item in enumerate(labels) if item == 1])
        output = output.view(output.shape[0])  ###todo!
        # loss_train += self._loss(output, labels, weights=[1, (len(train)-78)/78])
        loss_train += self._loss(
            output,
            labels,
            weights=[
                len(train) / (len(train) - labeled_1_num),
                len(train) / labeled_1_num
            ])  ## weights=[19/len(train), (len(train)-19)/len(train)]
        # loss_train /= len(train)
        loss_train.backward()
        optimizer.step()

        acc_train = self.accuracy(output, labels)

        if verbose == 2:
            # Evaluate validation set performance separately,
            # deactivates dropout during validation run.
            self._logger.debug(
                'Epoch: {:04d} '.format(epoch + 1) +
                'loss_train: {:.4f} '.format(loss_train.data.item()) +
                'acc_train: {:.4f} '.format(acc_train))
        return loss_train, acc_train

    def test(self, model=None, verbose=2, print_to_file=False):
        #self._loss=self._loss = BCEWithLogitsLoss(torch.ones([894618]).to(self._device))
        model_ = model["model"]
        model_ = model_.to(self._device)

        labels = torch.DoubleTensor(model["test_labels"]).to(
            dtype=torch.float, device=self._device)
        test = torch.from_numpy(model["test_data"]).to(dtype=torch.float,
                                                       device=self._device)
        model_.eval()
        '''self._loss = self.my_loss
        pos_weight = torch.ones([len(test)]).to(self._device)  # All weights are equal to 1
        pos_weight *= 79 / (len(test) - 79)
        self._loss = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)'''

        # feed the model
        output = model_(test)

        output = output.view(output.shape[0])  ###todo!
        self._loss = self.my_loss
        loss_test = 0.
        loss_test += self._loss(output,
                                labels)  #, weights=[1, (len(test) - 20) / 20])

        #loss_test += self._loss(output, labels)
        #loss_test /= len(test)

        acc_test = self.accuracy(output, labels)

        if verbose != 0:
            self._logger.info(
                "Test: loss= {:.4f} ".format(loss_test.data.item()) +
                "acc= {:.4f}".format(acc_test))
        result = {"loss": loss_test.data.item(), "acc": acc_test}
        return result
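
my_loss above is a hand-rolled weighted binary cross-entropy; with no weights it should agree with torch.nn.BCELoss (mean reduction) whenever the outputs already lie inside the clamp range. A quick sanity sketch:

import torch

out = torch.tensor([0.9, 0.2, 0.7])  # probabilities strictly inside (0, 1)
tgt = torch.tensor([1.0, 0.0, 1.0])
manual = torch.neg(torch.mean(tgt * torch.log(out) + (1 - tgt) * torch.log(1 - out)))
assert torch.allclose(manual, torch.nn.BCELoss()(out, tgt))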
Example #28
 def __init__(self, reactor_cls=None, logger=None):
     reactor_cls = reactor_cls or AsyncReactor
     self._reactor = reactor_cls()
     self._logger = logger or EmptyLogger()
Example #29

class ModelRunner:
    def __init__(self, conf, GS, logger, data_logger=None, is_nni=False):
        self._logger = logger
        self._data_logger = EmptyLogger() if data_logger is None else data_logger
        self._conf = conf
        self.bar = 0.5
        self._lr = conf["lr"]
        self._is_nni = is_nni
        # choosing GPU device
        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self._device != "cpu":
            with torch.cuda.device("cuda:{}".format(CUDA_Device)):
                torch.cuda.empty_cache()
            if not self._is_nni:
                self._device = torch.device("cuda:{}".format(CUDA_Device))
        self._loss = self.graphSaintLoss if GS else self.regular_loss
        self.accuracy = self.accuracy_GraphSaint if GS else self.accuracy_regular
        self._ce_loss = torch.nn.CrossEntropyLoss(reduction="mean").to(self._device)
        self._ce_loss2 = torch.nn.BCELoss(reduction='mean')


    @property
    def logger(self):
        return self._logger

    @property
    def data_logger(self):
        return self._data_logger

    def graphSaintLoss(self, calcs, beta=None, gamma=None):
        if beta is None:
            beta = 1 / len(calcs["f_ns_out"]) if len(calcs["f_ns_out"]) != 0 else 0
            gamma = 1 / len(calcs["s_ns_out"]) if len(calcs["s_ns_out"]) != 0 else 0

        cn_loss = self._ce_loss2(calcs["cn_out"], calcs["cn_label"].float())
        f_ns_loss = self._ce_loss2(calcs["f_ns_out"], calcs["f_ns_labels"].float()) * beta if len(calcs["f_ns_out"]) != 0 else 0
        s_ns_loss = self._ce_loss2(calcs["s_ns_out"], calcs["s_ns_labels"].float()) * gamma if len(calcs["s_ns_out"]) != 0 else 0
        return cn_loss + f_ns_loss + s_ns_loss

    def regular_loss(self, calcs, beta=None, gamma=None):
        if beta is None:
            beta = 1 / len(calcs["f_ns_out"]) if len(calcs["f_ns_out"]) != 0 else 0
            gamma = 1 / len(calcs["s_ns_out"]) if len(calcs["s_ns_out"]) != 0 else 0

        cn_loss = self._ce_loss(calcs["cn_out"], calcs["cn_label"].long())
        f_ns_loss = self._ce_loss(calcs["f_ns_out"], calcs["f_ns_labels"].long()) * beta if len(calcs["f_ns_out"]) != 0 else 0
        s_ns_loss = self._ce_loss(calcs["s_ns_out"], calcs["s_ns_labels"].long()) * gamma if len(calcs["s_ns_out"]) != 0 else 0
        return cn_loss + f_ns_loss + s_ns_loss


    def _get_model(self):
        model = GCN(in_features=self._conf["in_features"],
                    hid_features=self._conf["hid_features"],
                    out_features=self._conf["out_features"],
                    activation=self._conf["activation"],
                    dropout=self._conf["dropout"])
        opt = self._conf["optimizer"](model.parameters(), lr=self._conf["lr"], weight_decay=self._conf["weight_decay"])
        return {"model": model, "optimizer": opt,
                "beta": self._conf["beta"], "gamma": self._conf["gamma"],
                "labels": self._conf["labels"], "X": self._conf["X"], "ds_name": self._conf["ds_name"],
                "adj_tr": self._conf["adj_tr"], "adj_te": self._conf["adj_te"],
                "train_ind": self._conf["train_ind"], "test_ind": self._conf["test_ind"],
                "testt": self._conf["testt"], "traint": self._conf["traint"]}


    # verbose = 0 - silent
    # verbose = 1 - print test results
    # verbose = 2 - print train for each epoch and test results
    def run(self, verbose=2):

        if self._is_nni:
            verbose = 0
        model = self._get_model()
        ##
        loss_train, acc_train, intermediate_acc_test, losses_train, accs_train,  accs_cn_train, accs_f_train, accs_s_train, test_results = self.train(
            self._conf["epochs"],
            model=model,
            verbose=verbose)
        ##
        # Testing. ## "result" is only the final test run -- do not use it on its own; it is also appended to test_results.
        result = self.test(model=model, verbose=verbose if not self._is_nni else 0, print_to_file=True)
        test_results.append(result)
        if self._is_nni:
            self._logger.debug('Final loss train: %3.4f' % loss_train)
            self._logger.debug('Final accuracy train: %3.4f' % acc_train)
            final_results = result["acc"]
            self._logger.debug('Final accuracy test: %3.4f' % final_results)
            # _nni.report_final_result(test_auc)

        if verbose != 0:
            names = ""
            vals = ()
            for name, val in result.items():
                names = names + name + ": %3.4f  "
                vals = vals + tuple([val])
                self._data_logger.info(name, val)
        parameters = {"lr": self._conf["lr"],
                      "weight_decay": self._conf["weight_decay"],
                      "dropout": self._conf["dropout"],
                      "optimizer": self._conf["optim_name"]}
        return loss_train, acc_train, intermediate_acc_test, result, losses_train, accs_train, accs_cn_train, accs_f_train, accs_s_train, test_results, parameters

    def train(self, epochs, model=None, verbose=2):
        loss_train = 0.
        acc_train = 0.
        losses_train = []
        accs_train = []
        accs_cn_train = []
        accs_f_train = []
        accs_s_train = []


        test_results = []
        intermediate_test_acc = []
        for epoch in range(epochs):
            loss_train, acc_train, acc_train_cn , acc_train_f, acc_train_s= self._train(epoch, model, verbose)

            losses_train.append(loss_train)
            accs_train.append(acc_train)
            accs_cn_train.append(acc_train_cn)
            accs_f_train.append(acc_train_f)
            accs_s_train.append(acc_train_s)
            ##
            # /----------------------  FOR NNI  -------------------------
            if epoch % 5 == 0:
                test_res = self.test(model, verbose=verbose if not self._is_nni else 0)
                test_results.append(test_res)
                if self._is_nni:
                    test_acc = test_res["acc"]
                    intermediate_test_acc.append(test_acc)

        return loss_train, acc_train, intermediate_test_acc, losses_train, \
                accs_train, accs_cn_train, accs_f_train, accs_s_train, test_results

    ''' This function calculates the output and the labels for each node:
        for each node we take as input the model's outputs and the labels, and return the output and label of the central
        node, of its first neighbors, and of its second neighbors. NOTE: we take only those that are in the train indices.
    '''
    def calculate_labels_outputs(self, node, outputs, labels, indices, ego_graph):
        f_neighbors = set(list(ego_graph.neighbors(node)))
        s_neighbors = set()
        # create second neighbors
        for f_neighbor in f_neighbors:
            for s_neighbor in ego_graph.neighbors(f_neighbor):
                if s_neighbor not in f_neighbors and s_neighbor != node and s_neighbor not in s_neighbors:
                    s_neighbors.add(s_neighbor)
        # NOTE: we use "index" to correlate the neighbors with the output's rows: graph nodes are labeled 0..N
        # (in the big graph) while the output's rows run 0..len(ego_graph), so list(...).index(...) bridges the two.
        cn_out= outputs[[list(ego_graph.nodes).index(node)]]
        cn_label = labels[[node]]

        # create vectors for the first neighbors' outputs and labels. NOTE: we take only those that are in the train indices
        f_ns_out = outputs[[list(ego_graph.nodes).index(f_n) for f_n in f_neighbors if indices[f_n]]]
        f_ns_labels = labels[[f_n for f_n in f_neighbors if indices[f_n]]]
        # same for second neighbors
        s_ns_out = outputs[[list(ego_graph.nodes).index(s_n) for s_n in s_neighbors if indices[s_n]]]
        s_ns_labels = labels[[s_n for s_n in s_neighbors if indices[s_n]]]
        return {"cn_out": cn_out, "cn_label": cn_label, "f_ns_out": f_ns_out, "f_ns_labels": f_ns_labels,
                "s_ns_out": s_ns_out, "s_ns_labels": s_ns_labels}


    def _train(self, epoch, model, verbose=2):
        model_ = model["model"]
        model_ = model_.to(self._device)
        optimizer = model["optimizer"]
        # train_ind are the nodes to create subgraphs from; traint marks the nodes in the train set (that we can learn from)
        train_indices = model["train_ind"]
        model["labels"] = model["labels"].to(self._device)
        labels = model["labels"]
        beta = model["beta"]
        gamma = model["gamma"]
        model_.train()
        optimizer.zero_grad()

        loss_train = 0.
        loss_train1 = 0.
        calcs_batch = []
        BATCH_SIZE = 30
        # create subgraphs only for a partial set, but use the labels of all train indices
        for idx, node in enumerate(train_indices):

            # adj = nx.ego_graph(model["adj_matrices"], node, radius=2)
            adj = model["adj_tr"][node]
            X_t = model["X"][list(adj.nodes)].to(device=self._device)
            output = model_(X_t, nx.adjacency_matrix(adj).tocoo())
            calcs = self.calculate_labels_outputs( node, output, labels, model["traint"], adj)
            # no batches:
            loss_train += self._loss(calcs, beta, gamma)

            # # if we want to use batches
            # loss_train1 += self._loss(calcs, beta, gamma)
            # loss_train += self._loss(calcs, beta, gamma).data.item()
            # if idx % BATCH_SIZE == 0 and idx > 0:
            #     loss_train1 /= BATCH_SIZE
            #     loss_train1.backward()
            #     optimizer.step()
            #     loss_train1 = 0.

            calcs_batch.append(calcs)

        acc_train, acc_train_cn, acc_train_f, acc_train_s = self.accuracy(calcs_batch)
        loss_train /= len(train_indices)
        loss_train.backward()
        optimizer.step()

        if verbose == 2:
            # Evaluate validation set performance separately,
            # deactivates dropout during validation run.
            self._logger.debug('Epoch: {:04d} '.format(epoch + 1) +
                               'ce_loss_train: {:.4f} '.format(loss_train) +
                               'acc_train: {:.4f} '.format(acc_train))
        return loss_train, acc_train, acc_train_cn , acc_train_f, acc_train_s


    ''' Accuracy function. For GraphSaint we apply a sigmoid to each index and use BCE loss, then we concatenate the
        per-node result vectors into one vector of all the centrals, one vector of all the first neighbors, and one
        vector of all the second neighbors, set 1 in the indices whose value is >= 0.5 and 0 otherwise, and calculate
        the F1 score on each vector.
    '''
    @staticmethod
    def accuracy_GraphSaint(calcs):

        # Build one vector containing all the central nodes' outputs (one per node in the train/test set); same for the labels.
        out, labs = ([calcs[i]["cn_out"].data[0].tolist() for i in range(len(calcs))],
                     [calcs[i]["cn_label"].data[0].tolist() for i in range(len(calcs))])
        out = np.array(out)
        labs = np.array(labs)
        out[out > 0.5] = 1
        out[out <= 0.5] = 0
        acc_cn = metrics.f1_score(labs, out, average="micro")

        # Build one long vector of all the first-neighbor outputs: each node in the train/test set contributes its vector of first neighbors.
        out = []
        labs = []
        for i in range(len(calcs)):
            out += calcs[i]["f_ns_out"].data.tolist()
            labs += calcs[i]["f_ns_labels"].data.tolist()

        out = np.array(out)
        labs = np.array(labs)
        out[out > 0.5] = 1
        out[out <= 0.5] = 0

        if len(out) != 0:
            acc_f = metrics.f1_score(labs, out, average="micro")
        else:
            acc_f = np.nan

        # Same for the second neighbors.
        out = []
        labs = []
        for i in range(len(calcs)):
            out += calcs[i]["s_ns_out"].data.tolist()
            labs += calcs[i]["s_ns_labels"].data.tolist()

        out = np.array(out)
        labs = np.array(labs)
        out[out > 0.5] = 1
        out[out <= 0.5] = 0
        if len(out) != 0:
            acc_s = metrics.f1_score(labs, out, average="micro")
        else:
            acc_s = np.nan

        return np.nanmean(np.array([acc_cn, acc_f, acc_s])), acc_cn, acc_f, acc_s
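
    # A tiny worked example of the threshold-then-micro-F1 step above (illustrative values):
    #   out  = np.array([[0.9, 0.2], [0.4, 0.7]])  -> thresholded to [[1, 0], [0, 1]]
    #   labs = np.array([[1, 0], [1, 1]])
    #   metrics.f1_score(labs, out, average="micro") pools all entries into one binary score:
    #   TP=2, FP=0, FN=1, so precision=1.0, recall=2/3, micro-F1=0.8.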


    def accuracy_regular(self, calcs):
        out, labs = ([calcs[i]["cn_out"].data[0].tolist() for i in range(len(calcs))],
                     [calcs[i]["cn_label"].data[0].tolist() for i in range(len(calcs))])
        acc_cn = sum(np.argmax(np.array(out), axis=1) == labs) / len(labs)

        out = []
        labs = []
        for i in range(len(calcs)):
            out += calcs[i]["f_ns_out"].data.tolist()
            labs += calcs[i]["f_ns_labels"].data.tolist()
        if len(out) != 0:
            acc_f = sum(np.argmax(np.array(out), axis=1) == labs) / len(labs)
        else:
            acc_f = np.nan

        out = []
        labs = []
        for i in range(len(calcs)):
            out += calcs[i]["s_ns_out"].data.tolist()
            labs += calcs[i]["s_ns_labels"].data.tolist()

        if len(out) != 0:
            acc_s = sum(np.argmax(np.array(out), axis=1) == labs) / len(labs)
        else:
            acc_s = np.nan

        return np.nanmean(np.array([acc_cn, acc_f, acc_s])), acc_cn, acc_f, acc_s




    def test(self, model=None, verbose=2, print_to_file=False):
        model_ = model["model"]
        test_indices = model["test_ind"]
        labels = model["labels"]
        beta = model["beta"]
        gamma = model["gamma"]
        model_.eval()

        test_loss = 0
        calcs_batch = []
        with torch.no_grad():
            for node in test_indices:
                # adj = nx.ego_graph(model["adj_matrices"], node, radius=2)
                adj = model["adj_te"][node]
                X_t = model["X"][list(adj.nodes)].to(device=self._device)
                output = model_(X_t, nx.adjacency_matrix(adj).tocoo())
                calcs = self.calculate_labels_outputs(node, output, labels, model["testt"], adj)
                test_loss += self._loss(calcs, beta, gamma).data.item()
                calcs_batch.append(calcs)

            test_loss /= len(test_indices)
            test_acc, acc_test_cn, acc_test_f, acc_test_s = self.accuracy(calcs_batch)

            if verbose != 0:
                self._logger.info("Test: ce_loss= {:.4f} ".format(test_loss) + "acc= {:.4f}".format(test_acc))


            result = {"loss": test_loss, "acc": test_acc, "acc_cn": acc_test_cn, "acc_f":acc_test_f, "acc_s":acc_test_s}
            return result
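
    # For reference, a sketch of the "model" dict that _train and test expect, inferred
    # from the keys read above (the right-hand names are illustrative placeholders):
    #   model = {"model": net, "optimizer": opt, "labels": labels_tensor,
    #            "train_ind": train_nodes, "test_ind": test_nodes,
    #            "traint": train_membership, "testt": test_membership,
    #            "adj_tr": train_ego_graphs, "adj_te": test_ego_graphs,
    #            "X": feature_tensor, "beta": beta, "gamma": gamma}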
Example #30
class GraphFeatures(dict):
    def __init__(self,
                 gnx,
                 features,
                 dir_path,
                 logger=None,
                 is_max_connected=False):
        self._base_dir = dir_path
        self._logger = EmptyLogger() if logger is None else logger
        self._matrix = None

        self._gnx = get_max_subgraph(gnx) if is_max_connected else gnx

        self._abbreviations = {
            abbr: name
            for name, meta in features.items() for abbr in meta.abbr_set
        }

        # building the feature calculators data structure
        super(GraphFeatures, self).__init__({
            name: meta.calculator(self._gnx, logger=logger)
            for name, meta in features.items()
        })
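
    # A hedged usage sketch; FeatureMeta and the concrete calculator classes are assumed
    # to be defined elsewhere in this project, and "degree"/"deg" are illustrative names:
    #   features = {"degree": FeatureMeta(DegreeCalculator, {"deg"})}
    #   graph_features = GraphFeatures(gnx, features, dir_path="./features_dir")
    #   graph_features.build(num_processes=1, should_dump=True)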

    @property
    def graph(self):
        return self._gnx

    def _build_serially(self,
                        include,
                        force_build: bool = False,
                        dump_path: str = None):
        if VERBOSE:
            self._logger.debug("Start building graph features")
        if dump_path is not None and self._gnx is not None:
            pickle.dump(self._gnx,
                        open(self._feature_path("gnx", dump_path), "wb"))
        for name, feature in self.items():
            if force_build or not os.path.exists(self._feature_path(name)):
                is_dumped = dump_path is not None and feature.DUMPABLE
                msg = "Dumped to: %s" % dump_path if is_dumped else "Not dumped"
                feature.build(include=include, msg=msg)
                if is_dumped:
                    self._dump_feature(name, feature, dump_path)
            else:
                self._load_feature(name)
        if VERBOSE:
            self._logger.debug("Finished building graph features")

    # a single process means it is calculated serially
    def build(self,
              num_processes: int = 1,
              include: set = None,
              should_dump: bool = False):  # , exclude: set=None):
        # if exclude is None:
        #     exclude = set()
        if include is None:
            include = set()

        if 1 == num_processes:
            dump_path = None
            if should_dump:
                dump_path = self._base_dir
                if not os.path.exists(dump_path):
                    os.makedirs(dump_path)
            return self._build_serially(include, dump_path=dump_path)

        request_queue = Queue()
        workers = [
            Worker(request_queue, self, include, logger=self._logger)
            for _ in range(num_processes)
        ]
        # Starting all workers
        for worker in workers:
            worker.start()

        # Feeding the queue with all the features
        for feature_name in self:
            request_queue.put(feature_name)

        # Sentinel objects to allow clean shutdown: 1 per worker.
        for _ in range(num_processes):
            request_queue.put(None)

        # Joining all workers
        for worker in workers:
            worker.join()
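
    # Worker is defined elsewhere in this project; a minimal sketch of the pattern it
    # presumably follows (consume feature names until the None sentinel arrives):
    #   class Worker(multiprocessing.Process):
    #       def __init__(self, queue, features, include, logger=None):
    #           super(Worker, self).__init__()
    #           self._queue, self._features, self._include = queue, features, include
    #       def run(self):
    #           while True:
    #               name = self._queue.get()
    #               if name is None:  # sentinel: shut down cleanly
    #                   break
    #               self._features[name].build(include=self._include)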

    def _load_feature(self, name):
        if self._gnx is None:
            assert os.path.exists(self._feature_path("gnx")), \
                "Graph is not present in the given directory"
            self._gnx = pickle.load(open(self._feature_path("gnx"), "rb"))
        feature = pickle.load(open(self._feature_path(name), "rb"))
        feature.load_meta({
            name: getattr(self, name)
            for name in FeatureCalculator.META_VALUES
        })
        self[name] = feature
        return self[name]

    def __getattr__(self, name):
        if name not in self:
            if name in self._abbreviations:
                name = self._abbreviations[name]
            else:
                return super(GraphFeatures, self).__getattribute__(name)

        # if obj is already calculated - return it
        obj = self[name]
        if obj.is_loaded:
            return obj

        # if obj is not calculated, check whether it exists on the file system:
        # if it doesn't - calculate it; if it does - load it and return it
        if not os.path.exists(self._feature_path(name)):
            obj.build()
            return obj

        return self._load_feature(name)
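
    # Usage sketch: attribute access goes through the lazy compute-or-load path above, and
    # abbreviations resolve to full feature names ("deg"/"degree" are illustrative):
    #   feature = graph_features.deg     # resolves via self._abbreviations, e.g. to "degree"
    #   feature = graph_features.degree  # built on demand, or loaded from degree.pkl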

    @property
    def features(self):
        return set(self)

    def _feature_path(self, name, dir_path=None):
        if dir_path is None:
            dir_path = self._base_dir
        return os.path.join(dir_path, name + ".pkl")

    def _dump_feature(self, name, feature, dir_path):
        if feature.is_loaded:
            prev_meta = feature.clean_meta()  # in order not to save unnecessary data
            pickle.dump(feature, open(self._feature_path(name, dir_path), "wb"))
            feature.load_meta(prev_meta)

    def dump(self, dir_path=None):
        if dir_path is None:
            dir_path = self._base_dir

        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        for name, feature in self.items():
            self._dump_feature(name, feature, dir_path)

    @property
    def shape(self):
        sorted_features = map(at(1), sorted(self.items(), key=at(0)))
        sorted_features = [
            feature for feature in sorted_features
            if feature.is_relevant() and feature.is_loaded
        ]
        res = []
        for feature in sorted_features:
            res.append((feature.print_name(), feature.shape[1]))
        return res

    # sparse.csr_matrix(matrix, dtype=np.float32)
    def to_matrix(self,
                  entries_order: list = None,
                  add_ones=False,
                  dtype=None,
                  mtype=np.matrix,
                  should_zscore: bool = True):
        if entries_order is None:
            entries_order = sorted(self._gnx)

        sorted_features = map(at(1), sorted(self.items(), key=at(0)))
        # Consider caching the matrix creation (if it takes long time)
        sorted_features = [
            feature for feature in sorted_features
            if feature.is_relevant() and feature.is_loaded
        ]

        if sorted_features:
            mx = np.hstack([
                feature.to_matrix(entries_order,
                                  mtype=mtype,
                                  should_zscore=should_zscore)
                for feature in sorted_features
            ])
            if add_ones:
                mx = np.hstack([mx, np.ones((mx.shape[0], 1))])
            if dtype is not None:
                mx = mx.astype(dtype)
        else:
            mx = np.matrix([])

        return mtype(mx)
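
    # Usage sketch (illustrative call; entries_order defaults to sorted(self._gnx), and
    # add_ones appends a bias column of ones):
    #   mx = graph_features.to_matrix(add_ones=True, dtype=np.float32, mtype=np.matrix)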

    def to_dict(self, dtype=None, should_zscore: bool = True):
        mx = self.to_matrix(dtype=dtype,
                            mtype=np.matrix,
                            should_zscore=should_zscore)
        return {node: mx[i, :] for i, node in enumerate(sorted(self._gnx))}