Example #1
import os

import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import add_self_loops, remove_self_loops


def load_data(dataset="Cora"):
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'data', dataset)
    if dataset in ["Cora", "Citeseer", "Pubmed"]:
        data = Planetoid(path, dataset, transform=T.NormalizeFeatures())[0]
        num_nodes = data.x.size(0)
        edge_index, _ = remove_self_loops(data.edge_index)
        # add_self_loops returns an (edge_index, edge_attr) tuple in recent
        # PyG versions; older releases returned edge_index alone, hence the
        # isinstance check below
        edge_index = add_self_loops(edge_index, num_nodes=num_nodes)
        if isinstance(edge_index, tuple):
            data.edge_index = edge_index[0]
        else:
            data.edge_index = edge_index
        return data
    else:
        raise NotImplementedError(f'dataset {dataset} has not been implemented')
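
A minimal usage sketch for the loader above; the values in the comments are illustrative, and Planetoid downloads the raw files on first use:

data = load_data("Cora")
print(data.num_nodes)                # 2708 for Cora
print(data.train_mask.sum().item())  # 140 training nodes in the public split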
Example #2
import os

import torch_geometric.utils as gutils
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import train_test_split_edges

# load_wiki, sample_negative and combine_node_pair_features are
# project-local helpers, kept as in the original
import load_wiki


def load_data(dataset_name):
    if dataset_name in ['cora', 'citeseer', 'pubmed']:
        path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.',
                            'data', dataset_name)
        data = Planetoid(path, dataset_name)[0]
    else:
        data = load_wiki.load_data()

    data.edge_index = gutils.to_undirected(data.edge_index)
    # GAE.split_edges (called unbound, with the class itself as self) was
    # removed from PyG; train_test_split_edges is its replacement
    data = train_test_split_edges(data)

    features = data.x.numpy()
    train_pos_edges = data.train_pos_edge_index.numpy()
    train_neg_edges = sample_negative(count=train_pos_edges.shape[1],
                                      avoid=train_pos_edges,
                                      nodes=features.shape[0])

    x_tr, y_tr = combine_node_pair_features(features, train_pos_edges,
                                            train_neg_edges)
    x_val, y_val = combine_node_pair_features(features,
                                              data.val_pos_edge_index.numpy(),
                                              data.val_neg_edge_index.numpy())
    x_test, y_test = combine_node_pair_features(
        features, data.test_pos_edge_index.numpy(),
        data.test_neg_edge_index.numpy())
    return x_tr, y_tr, x_val, y_val, x_test, y_test
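
A sketch of how the six arrays could feed a link-prediction baseline; scikit-learn's LogisticRegression is an assumption here, not part of the original code:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

x_tr, y_tr, x_val, y_val, x_test, y_test = load_data('cora')

clf = LogisticRegression(max_iter=1000).fit(x_tr, y_tr)
# score the held-out edge splits produced by train_test_split_edges
print('val AUC:', roc_auc_score(y_val, clf.predict_proba(x_val)[:, 1]))
print('test AUC:', roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1]))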
Example #3
import os

import torch
import torch_geometric.transforms as T
from torch_geometric.datasets import Coauthor, Planetoid
from torch_geometric.utils import add_self_loops, remove_self_loops


def load_data(dataset="Cora"):
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                        "data", dataset)
    if dataset in ["Cora", "Citeseer", "Pubmed"]:
        data = Planetoid(path,
                         dataset,
                         split="public",
                         transform=T.NormalizeFeatures())[0]
        num_nodes = data.x.size(0)
        edge_index, _ = remove_self_loops(data.edge_index)
        edge_index = add_self_loops(edge_index, num_nodes=num_nodes)
        if isinstance(edge_index, tuple):
            data.edge_index = edge_index[0]  # !!! shape [2, N]; may have changed in newer versions
        else:
            data.edge_index = edge_index
        return data
    elif dataset in ["CoauthorCS"]:
        data = Coauthor(path, "cs", transform=T.NormalizeFeatures())[0]
        num_nodes = data.x.size(0)
        edge_index, _ = remove_self_loops(data.edge_index)
        edge_index = add_self_loops(edge_index, num_nodes=num_nodes)
        if isinstance(edge_index, tuple):
            data.edge_index = edge_index[0]
        else:
            data.edge_index = edge_index

        # divide into training, validation and test sets
        train_mask = torch.zeros((num_nodes, ), dtype=torch.bool)
        val_mask = torch.zeros((num_nodes, ), dtype=torch.bool)
        test_mask = torch.zeros((num_nodes, ), dtype=torch.bool)
        train_num = 40
        val_num = 150
        for i in range(15):  # CoauthorCS has 15 classes
            index = (data.y == i).nonzero()[:, 0]
            perm = torch.randperm(index.size(0))
            train_mask[index[perm[:train_num]]] = 1
            val_mask[index[perm[train_num:(train_num + val_num)]]] = 1
            test_mask[index[perm[(train_num + val_num):]]] = 1
        data.train_mask = train_mask
        data.val_mask = val_mask
        data.test_mask = test_mask
        return data
    else:
        raise NotImplementedError(f"dataset {dataset} has not been implemented")
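
A quick sanity check of the random CoauthorCS split; the expected counts follow from the constants above (15 classes, 40 train and 150 validation nodes per class):

data = load_data("CoauthorCS")
print(data.train_mask.sum().item())  # 15 * 40  = 600
print(data.val_mask.sum().item())    # 15 * 150 = 2250
print(data.test_mask.sum().item())   # all remaining nodes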
Example #4
import argparse
import os

import torch
import torch_geometric.utils as gutils
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import VGAE
from torch_geometric.utils import train_test_split_edges

import load_wiki  # project-local; VGAE_Encoder below is project-local too

# the following is excerpted from the script's main block:
    parser = argparse.ArgumentParser(description='VGAE')
    parser.add_argument('--dataset')
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--val-freq', type=int, default=20)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--test', action='store_true', default=False)
    args = parser.parse_args()

    if args.dataset in ['cora', 'citeseer', 'pubmed']:
        path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.',
                            'data', args.dataset)
        data = Planetoid(path, args.dataset)[0]
    else:
        data = load_wiki.load_data()

    data.edge_index = gutils.to_undirected(data.edge_index)
    # GAE.split_edges (called unbound) was removed from PyG;
    # train_test_split_edges is its replacement
    data = train_test_split_edges(data)

    num_features = data.x.shape[1]
    aucs = []
    aps = []
    for run in range(args.runs):
        model = VGAE(VGAE_Encoder(num_features))
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        # Training loop
        for epoch in range(args.epochs):
            model.train()
            optimizer.zero_grad()
            z = model.encode(data.x, data.train_pos_edge_index)
            loss = model.recon_loss(z, data.train_pos_edge_index)
            # VGAE objective: reconstruction loss plus the KL regularizer
            loss = loss + (1 / data.num_nodes) * model.kl_loss()
            loss.backward()
            optimizer.step()
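            # --- hedged sketch, not part of the original (which breaks off
            # above): the evaluation that would fill `aucs` and `aps`, using
            # PyG's GAE.test, which returns (AUC, average precision) ---
            if (epoch + 1) % args.val_freq == 0:
                model.eval()
                with torch.no_grad():
                    z = model.encode(data.x, data.train_pos_edge_index)
                    auc, ap = model.test(z, data.val_pos_edge_index,
                                         data.val_neg_edge_index)
                print(f'epoch {epoch + 1}: val AUC={auc:.4f} AP={ap:.4f}')

        # score the test split once per run
        model.eval()
        with torch.no_grad():
            z = model.encode(data.x, data.train_pos_edge_index)
            auc, ap = model.test(z, data.test_pos_edge_index,
                                 data.test_neg_edge_index)
        aucs.append(auc)
        aps.append(ap)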

Example #5

import numpy as np
import pandas as pd
import torch
import torch_geometric.transforms as T
from torch_geometric.data import Data
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_undirected


def data_downloader(dataset='Cora', data_dir='../data', data_type='static'):
    '''
    Download and build a graph dataset.

    Parameters:
        dataset (:obj:`str`): dataset name: 'Cora', 'CiteSeer', 'PubMed',
            or 'Factset<year>' (e.g. 'Factset2019').
        data_dir (:obj:`str`): root directory of the raw data files.
        data_type (:obj:`str`): 'static' or 'dynamic' (Factset only).

    Returns:
        data (torch_geometric.data.Data): the graph data.
    '''

    if dataset in ['Cora', 'CiteSeer', 'PubMed']:
        data = Planetoid(data_dir, dataset, transform=T.NormalizeFeatures())[0]

    elif 'Factset' in dataset:
        year = dataset[-4:]
        print(f'processing Factset in year {year}.')
        if data_type == 'dynamic':
            df = pd.read_csv(
                data_dir +
                f'/Factset/node_features_{year}_dynamic_processed.csv'
            ).drop_duplicates(ignore_index=True, subset='code')
        else:
            df = pd.read_csv(data_dir +
                             f'/Factset/node_features_{year}_processed.csv'
                             ).drop_duplicates(ignore_index=True,
                                               subset='code')
        N = len(df)  # number of nodes

        # map sec_code to node index
        dic = {}
        for row in df.itertuples():
            dic[row[1]] = row[0]

        edge = pd.read_csv(data_dir + f'/Factset/edges_{year}.csv',
                           usecols=[
                               'REL_TYPE', 'SOURCE_COMPANY_TICKER',
                               'TARGET_COMPANY_TICKER'
                           ]).rename(columns={
                               'SOURCE_COMPANY_TICKER': 'source',
                               'TARGET_COMPANY_TICKER': 'target'
                           })
        edge = edge[(edge['REL_TYPE'] == 'CUSTOMER') |
                    (edge['REL_TYPE'] == 'SUPPLIER')]
        edge = edge[['source',
                     'target']].drop_duplicates(ignore_index=True,
                                                subset=['source', 'target'])

        # drop the reverse duplicate of each (source, target) pair
        for i in range(edge.shape[0]):
            if i in edge.index:
                source = edge.loc[i, 'source']
                target = edge.loc[i, 'target']
                edge = edge.drop(edge[(edge['source'] == target)
                                      & (edge['target'] == source)].index)

        # map tickers to node indices; unknown tickers become NaN and are dropped
        edge = edge.applymap(lambda x: dic[x] if x in dic else np.nan)
        edge = edge.dropna(how='any').reset_index(drop=True)

        # handle missing values
        df = df.iloc[:, 5:]  # drop sec_code and the other identifier columns
        # df = df.dropna(thresh=100, axis=1)  # keep columns with >= thresh non-NaN values
        df = df.fillna(0)  # fill missing values with zeros
        df = (df - df.mean()) / df.std()
        df = df.fillna(0)  # zero-variance columns yield NaN after standardizing

        # X to tensor
        X = [[] for _ in range(N)]
        for row in df.itertuples():
            X[row[0]] = row[1:]
        X = torch.tensor(X, dtype=torch.float)

        # edge_index to tensor
        edge_index = torch.tensor(edge.to_numpy().T, dtype=torch.long)

        # torch_geometric.data.Data
        data = Data(x=X, edge_index=edge_index)

    print(f'dataset {dataset} has been downloaded.')
    print(f'is undirected: {data.is_undirected()}')
    print(f'contains self loops: {data.contains_self_loops()}')
    print(f'num_nodes: {data.num_nodes}')
    print(f'num_edges: {data.num_edges}\n')

    if not data.is_undirected():
        data.edge_index = to_undirected(data.edge_index)
        print('The graph has been converted to an undirected one.')

    return data
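
Typical usage, assuming the Planetoid files can be fetched into data_dir; the printed shapes are illustrative:

data = data_downloader(dataset='Cora', data_dir='../data')
print(data.x.shape)           # [num_nodes, num_features]
print(data.edge_index.shape)  # [2, num_edges]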