Example No. 1
def preprocess_citation_bigraph(adj, features, normalization="FirstOrderGCN"):
    adj_normalizer = fetch_normalization(normalization)
    adj = adj_normalizer(adj)
    adj_cn = features.T                 # content -> node incidence (transpose of the feature matrix)
    features = row_normalize(features)
    adj_cn = row_normalize(adj_cn)      # row-normalized content -> node transition
    adj_nc = features                   # row-normalized node -> content transition
    return adj, features, adj_nc, adj_cn
Example No. 2
def get_hops(adj, n_hops, args):
    # adj: scipy.sparse CSR adjacency matrix; n_hops: number of hop matrices to build and cache
    hop_file = 'hop_adj/' + args.dataset + '_hop_{}'.format(
        args.nlayer) + '.pickle'

    if os.path.isfile(hop_file):
        with open(hop_file, 'rb') as f:
            adj_result = pickle.load(f)
    else:
        n_node, _ = adj.shape
        adj = adj - sp.dia_matrix(
            (adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape, dtype=float)
        adj_orig = adj
        adj_result = []
        for i in range(n_hops):
            if i == 0:
                adj_ = adj_orig.tocoo()
            else:
                adj = sp.csr_matrix(adj.dot(adj_orig.toarray().T))
                adj_ = adj
            print('---> Sparse rate of %d is : %.4f' %
                  (i + 1, adj_.nnz / n_node / n_node))
            adj_ = row_normalize(adj_)
            adj_result.append(adj_)

        with open(hop_file, 'wb') as pfile:
            pickle.dump(adj_result, pfile, pickle.HIGHEST_PROTOCOL)
    return adj_result
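get_hops caches one row-normalized power of the adjacency matrix per hop, so the i-th entry of the returned list captures walks of length i+1. Below is a minimal, self-contained illustration of that idea on a toy graph; it deliberately omits the caching, the args plumbing, and the row normalization used above.

import numpy as np
import scipy.sparse as sp

# Toy 4-node path graph 0-1-2-3, as a symmetric CSR adjacency with no self-loops.
adj = sp.csr_matrix(np.array([[0, 1, 0, 0],
                              [1, 0, 1, 0],
                              [0, 1, 0, 1],
                              [0, 0, 1, 0]], dtype=float))

hops = []
power = adj
for i in range(3):
    if i > 0:
        power = sp.csr_matrix(power.dot(adj))   # (i+1)-th power: walks of length i+1
    print('---> Sparse rate of %d is : %.4f' % (i + 1, power.nnz / adj.shape[0] ** 2))
    hops.append(power)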
Example No. 3
def preprocess_citation_graph(adj, features, normalization="FirstOrderGCN"):
    adj_normalizer = fetch_normalization(normalization)
    adj = adj_normalizer(adj)
    adj2 = features * features.T        # feature-similarity graph over the nodes
    adj2 = adj_normalizer(adj2)
    features = row_normalize(features)
    return adj, features, adj2
Example No. 4
def preprocess_citation(adj, features, normalization="FirstOrderGCN", gamma=1):
    adj_normalizer = fetch_normalization(normalization)
    if 'Aug' in normalization:
        adj = adj_normalizer(adj, gamma=gamma)
    else:
        adj = adj_normalizer(adj)
    features = row_normalize(features)
    return adj, features
Example No. 5
def load_data(dataset_str="cora",
              normalization=[],
              feat_normalize=True,
              cuda=False,
              split="default",
              random_state=None,
              **kwargs):
    """
    Load pickle packed datasets.
    """
    with open(dataf+dataset_str+".graph", "rb") as f:
        graph = pkl.load(f)
    with open(dataf+dataset_str+".X", "rb") as f:
        X = pkl.load(f)
    with open(dataf+dataset_str+".y", "rb") as f:
        y = pkl.load(f)
    if split != "default":
        tr_size, va_size, te_size = [float(i) for i in split.split("_")]
        idx_train, idx_val, idx_test = \
            train_val_test_split(np.arange(len(y)), train_size=tr_size,
                                 val_size=va_size, test_size=te_size,
                                 stratify=y, random_state=random_state) 
    else:
        with open(dataf+dataset_str+".split", "rb") as f:
            split = pkl.load(f)
            idx_train = split['train']
            idx_test = split['test']
            idx_val = split['valid']

    normed_adj = []
    if len(normalization) > 0:
        adj = nx.adjacency_matrix(graph)
        for n in normalization:
            nf = fetch_normalization(n, **kwargs)
            normed_adj.append(nf(adj))

    if feat_normalize:
        X = row_normalize(X)

    X = torch.FloatTensor(X).float()
    y = torch.LongTensor(y)
    normed_adj = [sparse_mx_to_torch_sparse_tensor(adj).float() \
                  for adj in normed_adj]
    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)

    if cuda:
        X = X.cuda()
        normed_adj = [adj.cuda() for adj in normed_adj]
        y = y.cuda()
        idx_train = idx_train.cuda()
        idx_val = idx_val.cuda()
        idx_test = idx_test.cuda()

    return graph, normed_adj, X, y, idx_train, idx_val, idx_test
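sparse_mx_to_torch_sparse_tensor is used throughout these examples but never shown. A minimal sketch, assuming the usual pygcn-style conversion from a scipy sparse matrix to a torch sparse COO tensor:

import numpy as np
import torch

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    # Convert a scipy sparse matrix to a torch sparse COO tensor.
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse_coo_tensor(indices, values, shape)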
Example No. 6
def preprocess_citation(adj, features, normalization, extra=None):
    adj_normalizer = fetch_normalization(normalization, extra)
    adj = adj_normalizer(adj)
    #row_sum = 1 / (np.sqrt(np.array(adj.sum(1))))
    #row_sum = np.array(adj.sum(1))
    #features = row_sum
    #features = features.todense()
    #features = np.concatenate([features, row_sum], axis=1) 
    #features = sp.lil_matrix(features)
    if normalization != "":
        features = row_normalize(features)
    return adj, features
Example No. 7
def preprocess_citation(adj, features, normalization='AugNormAdj'):
    adj_normalizer = fetch_normalization(normalization)
    adj = adj_normalizer(adj)
    features = row_normalize(features)
    return adj, features
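The helpers fetch_normalization and row_normalize are assumed by every example here but are not defined in these snippets. A minimal sketch of what they typically look like in GCN-style preprocessing code; the symmetric D^-1/2 (A + I) D^-1/2 formula for 'AugNormAdj' is an assumption based on common usage, not taken from these snippets.

import numpy as np
import scipy.sparse as sp

def row_normalize(mx):
    # Scale every row of a sparse matrix to sum to 1; all-zero rows stay zero.
    rowsum = np.asarray(mx.sum(1), dtype=float).flatten()
    r_inv = np.divide(1.0, rowsum, out=np.zeros_like(rowsum), where=rowsum != 0)
    return sp.diags(r_inv).dot(mx)

def aug_normalized_adjacency(adj):
    # Symmetric "augmented" normalization D^-1/2 (A + I) D^-1/2.
    adj = adj + sp.eye(adj.shape[0])
    deg = np.asarray(adj.sum(1), dtype=float).flatten()
    d_inv_sqrt = np.divide(1.0, np.sqrt(deg), out=np.zeros_like(deg), where=deg > 0)
    d_mat = sp.diags(d_inv_sqrt)
    return d_mat.dot(adj).dot(d_mat).tocoo()

def fetch_normalization(name, **kwargs):
    # Look up a normalizer by name; only the variant sketched above is covered here.
    return {'AugNormAdj': aug_normalized_adjacency}[name]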
Example No. 8
def preprocess_citation_feat(features):
    features = row_normalize(features)
    return features
Example No. 9
def preprocess_citation(adj, features, normalization="FirstOrderGCN"):
    adj_normalizer = fetch_normalization(normalization)
    #features, Droot = row_normalize(features,adj)
    features = row_normalize(features)
    adj = adj_normalizer(adj)
    return adj, features
Example No. 10
def load_citation_gac(dataset_str="cora", semi=1):
    """
    Load Citation Networks Datasets.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str.lower(), names[i]),
                  'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file(
        "data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder),
                                    max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + 500)

    features = row_normalize(features)

    # porting to pytorch
    # features = torch.FloatTensor(np.array(features.todense())).float()

    features = sparse_mx_to_torch_sparse_tensor(features).float()

    labels = torch.LongTensor(labels)
    labels = torch.max(labels, dim=1)[1]
    # adj = sparse_mx_to_torch_sparse_tensor(adj).float()

    if semi == 0:
        idx_all = list(range(labels.shape[0]))
        used_all = set(idx_train).union(set(idx_val)).union(set(idx_test))
        unused_all = list(set(idx_all).difference(used_all))
        idx_train = list(idx_train) + unused_all

    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)

    return adj, features, labels, idx_train, idx_val, idx_test
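parse_index_file reads the permuted test indices of the Planetoid datasets and is not included in these snippets. A minimal sketch, assuming one integer node index per line of the ".test.index" file:

def parse_index_file(filename):
    # Read one integer node index per line from a Planetoid ".test.index" file.
    with open(filename) as f:
        return [int(line.strip()) for line in f]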
Example No. 11
def load_webANEmat_gac(dataset_str="texas", semi=1, semi_rate=0.1):
    data_file = 'data/{}/{}'.format('web', dataset_str) + '.mat'
    file_train = 'data/{}/{}_train'.format('web', dataset_str) + '.pickle'
    file_valid = 'data/{}/{}_valid'.format('web', dataset_str) + '.pickle'
    file_test = 'data/{}/{}_test'.format('web', dataset_str) + '.pickle'
    data = scio.loadmat(data_file)
    features = data['Attributes']
    labels = data['Label'].reshape(-1)
    adj = data['Network']
    features = row_normalize(features)

    if (adj != adj.T).sum() != 0:
        adj = adj + adj.T
    if np.any(np.unique(adj[adj.nonzero()].A1) != 1):
        adj.data = np.ones_like(adj.data)

    label_min = np.min(labels)
    if label_min != 0:
        labels = labels - 1

    with open(file_test, 'rb') as f:
        idx_test = pickle.load(f)
    with open(file_valid, 'rb') as f:
        idx_val = pickle.load(f)
    with open(file_train, 'rb') as f:
        idx_train = pickle.load(f)
    if semi == 1:
        train_idx_file = 'data/' + 'web' + '/' + dataset_str + '_train_{}'.format(
            semi_rate) + '.pickle'
        valid_idx_file = 'data/' + 'web' + '/' + dataset_str + '_valid_{}'.format(
            semi_rate) + '.pickle'
        test_idx_file = 'data/' + 'web' + '/' + dataset_str + '_test_{}'.format(
            semi_rate) + '.pickle'
        if os.path.isfile(train_idx_file):
            with open(test_idx_file, 'rb') as f:
                idx_test = pickle.load(f)
            with open(valid_idx_file, 'rb') as f:
                idx_val = pickle.load(f)
            with open(train_idx_file, 'rb') as f:
                idx_train = pickle.load(f)
        else:
            mask = np.unique(labels)
            label_count = [np.sum(labels == v) for v in mask]
            idx_train = []
            idx_val = []
            idx_test = []
            for i, v in enumerate(mask):
                cnt = label_count[i]
                idx_all = np.where(labels == v)[0]
                np.random.shuffle(idx_all)
                idx_all = idx_all.tolist()
                test_len = math.ceil(cnt * 0.2)
                valid_len = math.ceil(cnt * 0.2)
                train_len = math.ceil(cnt * semi_rate)
                idx_test.extend(idx_all[-test_len:])
                idx_val.extend(idx_all[-(test_len + valid_len):-test_len])
                train_len_ = min(train_len, cnt - test_len - valid_len)
                idx_train.extend(idx_all[:train_len_])

            idx_train = np.array(idx_train)
            idx_val = np.array(idx_val)
            idx_test = np.array(idx_test)

            with open(train_idx_file, 'wb') as pfile:
                pickle.dump(idx_train, pfile, pickle.HIGHEST_PROTOCOL)
            with open(test_idx_file, 'wb') as pfile:
                pickle.dump(idx_test, pfile, pickle.HIGHEST_PROTOCOL)
            with open(valid_idx_file, 'wb') as pfile:
                pickle.dump(idx_val, pfile, pickle.HIGHEST_PROTOCOL)

    features = sparse_mx_to_torch_sparse_tensor(features).float()

    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)
    labels = torch.LongTensor(labels)

    return adj, features, labels, idx_train, idx_val, idx_test
Example No. 12
def load_citationANEmat_gac(dataset_str="BlogCatalog", semi_rate=0.1):
    data_file = 'data/{}/{}'.format('social', dataset_str) + '.mat'
    data = scio.loadmat(data_file)
    if dataset_str == 'ACM':
        features = data['Features']
    else:
        features = data['Attributes']
    labels = data['Label'].reshape(-1)
    adj = data['Network']
    # adj, features = preprocess_citation(adj, features, normalization)
    features = row_normalize(features)

    label_min = np.min(labels)
    if label_min != 0:
        labels = labels - 1

    train_idx_file = 'data/' + 'social' + '/' + dataset_str + '_train_{}'.format(
        semi_rate) + '.pickle'
    valid_idx_file = 'data/' + 'social' + '/' + dataset_str + '_valid_{}'.format(
        semi_rate) + '.pickle'
    test_idx_file = 'data/' + 'social' + '/' + dataset_str + '_test_{}'.format(
        semi_rate) + '.pickle'
    if os.path.isfile(train_idx_file):
        with open(test_idx_file, 'rb') as f:
            idx_test = pickle.load(f)
        with open(valid_idx_file, 'rb') as f:
            idx_val = pickle.load(f)
        with open(train_idx_file, 'rb') as f:
            idx_train = pickle.load(f)
    else:
        mask = np.unique(labels)
        label_count = [np.sum(labels == v) for v in mask]
        idx_train = []
        idx_val = []
        idx_test = []
        for i, v in enumerate(mask):
            cnt = label_count[i]
            idx_all = np.where(labels == v)[0]
            np.random.shuffle(idx_all)
            idx_all = idx_all.tolist()
            test_len = math.ceil(cnt * 0.2)
            valid_len = math.ceil(cnt * 0.2)
            train_len = math.ceil(cnt * semi_rate)
            idx_test.extend(idx_all[-test_len:])
            idx_val.extend(idx_all[-(test_len + valid_len):-test_len])
            train_len_ = min(train_len, cnt - test_len - valid_len)
            idx_train.extend(idx_all[:train_len_])

        idx_train = np.array(idx_train)
        idx_val = np.array(idx_val)
        idx_test = np.array(idx_test)

        with open(train_idx_file, 'wb') as pfile:
            pickle.dump(idx_train, pfile, pickle.HIGHEST_PROTOCOL)
        with open(test_idx_file, 'wb') as pfile:
            pickle.dump(idx_test, pfile, pickle.HIGHEST_PROTOCOL)
        with open(valid_idx_file, 'wb') as pfile:
            pickle.dump(idx_val, pfile, pickle.HIGHEST_PROTOCOL)
    # adj = sparse_mx_to_torch_sparse_tensor(adj).float()
    features = sparse_mx_to_torch_sparse_tensor(features).float()

    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)
    labels = torch.LongTensor(labels)

    return adj, features, labels, idx_train, idx_val, idx_test
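Examples No. 11 and No. 12 build their semi-supervised splits with the same per-class logic: the last 20% of each class goes to test, the 20% before that to validation, and the first ceil(count * semi_rate) nodes (capped by what remains) to train. A standalone sketch of that shared logic; the helper name per_class_split is hypothetical.

import math
import numpy as np

def per_class_split(labels, semi_rate=0.1, test_frac=0.2, valid_frac=0.2):
    # labels: 1-D integer array of class ids starting at 0.
    idx_train, idx_val, idx_test = [], [], []
    for v in np.unique(labels):
        idx_all = np.where(labels == v)[0]
        np.random.shuffle(idx_all)
        idx_all = idx_all.tolist()
        cnt = len(idx_all)
        test_len = math.ceil(cnt * test_frac)
        valid_len = math.ceil(cnt * valid_frac)
        train_len = min(math.ceil(cnt * semi_rate), cnt - test_len - valid_len)
        idx_test.extend(idx_all[-test_len:])
        idx_val.extend(idx_all[-(test_len + valid_len):-test_len])
        idx_train.extend(idx_all[:train_len])
    return np.array(idx_train), np.array(idx_val), np.array(idx_test)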
Example No. 13
def preprocess_citation(adj, features, normalization='FirstOrderGCN'):
    adj_normalizer = fetch_normalization(normalization)
    adj = adj_normalizer(adj)
    features = row_normalize(features)
    return adj, features