def load_data(dataset="Cora"):
    """Load a Planetoid citation dataset with normalized features and self-loops.

    Parameters:
        dataset (str): one of "Cora", "Citeseer", "Pubmed".

    Returns:
        torch_geometric.data.Data: the graph, with duplicate self-loops
        removed from ``edge_index`` and exactly one self-loop per node added.

    Raises:
        Exception: if ``dataset`` is not one of the supported names.
    """
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                        '..', 'data', dataset)
    if dataset in ["Cora", "Citeseer", "Pubmed"]:
        # BUG FIX: the transform was passed positionally, which binds it to
        # Planetoid's third parameter (`split`), not `transform`. Pass it by
        # keyword so feature normalization is actually applied.
        data = Planetoid(path, dataset, transform=T.NormalizeFeatures())[0]
        num_nodes = data.x.size(0)
        # Strip pre-existing self-loops, then add one per node.
        edge_index, _ = remove_self_loops(data.edge_index)
        edge_index = add_self_loops(edge_index, num_nodes=num_nodes)
        # Newer torch_geometric returns (edge_index, edge_attr) here.
        if isinstance(edge_index, tuple):
            data.edge_index = edge_index[0]
        else:
            data.edge_index = edge_index
        return data
    else:
        raise Exception(f'the dataset of {dataset} has not been implemented')
def load_data(dataset_name):
    """Build train/val/test node-pair feature matrices for link prediction.

    Planetoid datasets ('cora', 'citeseer', 'pubmed') are read from the local
    ``data`` directory; any other name falls back to the Wiki loader. Edges
    are made undirected, split via GAE, and then turned into labelled
    node-pair feature vectors (label 1 = positive edge, 0 = sampled negative).

    Returns:
        (x_tr, y_tr, x_val, y_val, x_test, y_test)
    """
    if dataset_name in ['cora', 'citeseer', 'pubmed']:
        root = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            '.', 'data', dataset_name)
        data = Planetoid(root, dataset_name)[0]
    else:
        data = load_wiki.load_data()

    data.edge_index = gutils.to_undirected(data.edge_index)
    # split_edges is invoked unbound, with the class itself standing in
    # for `self` (no GAE instance is ever constructed here).
    data = GAE.split_edges(GAE, data)

    node_feats = data.x.numpy()
    pos_train = data.train_pos_edge_index.numpy()
    # Sample one negative pair per positive training edge.
    neg_train = sample_negative(count=pos_train.shape[1],
                                avoid=pos_train,
                                nodes=node_feats.shape[0])

    x_tr, y_tr = combine_node_pair_features(node_feats, pos_train, neg_train)
    x_val, y_val = combine_node_pair_features(
        node_feats,
        data.val_pos_edge_index.numpy(),
        data.val_neg_edge_index.numpy())
    x_test, y_test = combine_node_pair_features(
        node_feats,
        data.test_pos_edge_index.numpy(),
        data.test_neg_edge_index.numpy())

    return x_tr, y_tr, x_val, y_val, x_test, y_test
def load_data(dataset="Cora"):
    """Load a node-classification dataset with normalized features and self-loops.

    Parameters:
        dataset (str): "Cora", "Citeseer", "Pubmed", or "CoauthorCS".

    Returns:
        torch_geometric.data.Data: the graph with cleaned self-loops. For
        CoauthorCS (which ships without official splits), random per-class
        train/val/test masks are attached (40 train / 150 val / rest test).

    Raises:
        Exception: if ``dataset`` is not one of the supported names.
    """
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                        "..", "data", dataset)
    if dataset in ["Cora", "Citeseer", "Pubmed"]:
        data = Planetoid(path, dataset, split="public",
                         transform=T.NormalizeFeatures())[0]
        data.edge_index = _normalize_self_loops(data.edge_index,
                                                data.x.size(0))
        return data
    elif dataset in ["CoauthorCS"]:
        data = Coauthor(path, "cs", transform=T.NormalizeFeatures())[0]
        num_nodes = data.x.size(0)
        data.edge_index = _normalize_self_loops(data.edge_index, num_nodes)
        # Build train/validation/test masks: per class, 40 nodes for
        # training, 150 for validation, the remainder for testing.
        train_mask = torch.zeros((num_nodes, ), dtype=torch.bool)
        val_mask = torch.zeros((num_nodes, ), dtype=torch.bool)
        test_mask = torch.zeros((num_nodes, ), dtype=torch.bool)
        train_num = 40
        val_num = 150
        # Generalized: derive the class count from the labels instead of
        # hard-coding 15 (CoauthorCS has 15 classes, so this is equivalent).
        num_classes = int(data.y.max().item()) + 1
        for i in range(num_classes):
            index = (data.y == i).nonzero()[:, 0]
            perm = torch.randperm(index.size(0))
            train_mask[index[perm[:train_num]]] = 1
            val_mask[index[perm[train_num:(train_num + val_num)]]] = 1
            test_mask[index[perm[(train_num + val_num):]]] = 1
        data.train_mask = train_mask
        data.val_mask = val_mask
        data.test_mask = test_mask
        return data
    else:
        raise Exception(f"the dataset of {dataset} has not been implemented")


def _normalize_self_loops(edge_index, num_nodes):
    """Remove existing self-loops, then add exactly one per node.

    NOTE: newer torch_geometric versions return (edge_index, edge_attr)
    from ``add_self_loops``; unwrap the tuple in that case.
    """
    edge_index, _ = remove_self_loops(edge_index)
    edge_index = add_self_loops(edge_index, num_nodes=num_nodes)
    if isinstance(edge_index, tuple):
        return edge_index[0]
    return edge_index
parser = argparse.ArgumentParser(description='VGAE') parser.add_argument('--dataset') parser.add_argument('--epochs', type=int, default=200) parser.add_argument('--val-freq', type=int, default=20) parser.add_argument('--runs', type=int, default=10) parser.add_argument('--test', action='store_true', default=False) args = parser.parse_args() if args.dataset in ['cora', 'citeseer', 'pubmed']: path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.', 'data', args.dataset) data = Planetoid(path, args.dataset)[0] else: data = load_wiki.load_data() data.edge_index = gutils.to_undirected(data.edge_index) data = GAE.split_edges(GAE, data) num_features = data.x.shape[1] aucs = [] aps = [] for run in range(args.runs): model = VGAE(VGAE_Encoder(num_features)) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # Training loop for epoch in range(args.epochs): model.train() optimizer.zero_grad() z = model.encode(data.x, data.train_pos_edge_index) loss = model.recon_loss(
def data_downloader(dataset='Cora', data_dir='../data', data_type='static'):
    '''Download or construct a graph dataset.

    Parameters:
        dataset (:obj:`str`): dataset name. 'Cora', 'CiteSeer', 'PubMed',
            or a Factset name whose last 4 characters are the year
            (e.g. 'Factset2020').
        data_dir (:obj:`str`): root directory holding the data files.
        data_type (:obj:`str`): for Factset, 'dynamic' selects the
            dynamically-processed node-feature CSV; anything else uses the
            static one.

    Returns:
        data (torch_geometric.data.Data): the graph data.
    '''
    if dataset in ['Cora', 'CiteSeer', 'PubMed']:
        data = Planetoid(data_dir, dataset, transform=T.NormalizeFeatures())[0]
    elif 'Factset' in dataset:
        # The year is encoded in the last 4 characters of the dataset name.
        year = dataset[-4:]
        print(f'processing Factset in year {year}.')
        if data_type == 'dynamic':
            df = pd.read_csv(
                data_dir +
                f'/Factset/node_features_{year}_dynamic_processed.csv'
            ).drop_duplicates(ignore_index=True, subset='code')
        else:
            df = pd.read_csv(data_dir +
                             f'/Factset/node_features_{year}_processed.csv'
                             ).drop_duplicates(ignore_index=True,
                                               subset='code')
        N = len(df)  # number of nodes
        # Map security code -> node index (row[1] is the 'code' column,
        # row[0] the positional index after drop_duplicates).
        dic = {}
        for row in df.itertuples():
            dic[row[1]] = row[0]
        edge = pd.read_csv(data_dir + f'/Factset/edges_{year}.csv',
                           usecols=[
                               'REL_TYPE', 'SOURCE_COMPANY_TICKER',
                               'TARGET_COMPANY_TICKER'
                           ]).rename(columns={
                               'SOURCE_COMPANY_TICKER': 'source',
                               'TARGET_COMPANY_TICKER': 'target'
                           })
        # Keep only customer/supplier relations.
        edge = edge[(edge['REL_TYPE'] == 'CUSTOMER')
                    | (edge['REL_TYPE'] == 'SUPPLIER')]
        edge = edge[['source', 'target']].drop_duplicates(
            ignore_index=True, subset=['source', 'target'])
        # Drop the reverse of every surviving pair so each undirected
        # relation is kept once. NOTE(review): `edge` is mutated while
        # iterating its original index range — the `i in edge.index` guard
        # skips rows already dropped, so the first occurrence wins.
        for i in range(edge.shape[0]):
            if i in edge.index:
                source = edge.loc[i, 'source']
                target = edge.loc[i, 'target']
                edge = edge.drop(edge[(edge['source'] == target)
                                      & (edge['target'] == source)].index)
        # Translate tickers to node indices; unknown tickers become NaN
        # and are removed below.
        edge = edge.applymap(lambda x: dic[x] if x in dic.keys() else np.nan)
        edge = edge.dropna(how='any').reset_index(drop=True)
        # Missing-value handling.
        df = df.iloc[:, 5:]  # drop the sec_code (and other id) columns
        # df = df.dropna(thresh=100, axis=1)  # keep columns with >= thresh non-NaN values
        df = df.fillna(0)  # fill NaNs with 0 (original comment claimed mean imputation — TODO confirm intent)
        # Standardize columns; a second fillna(0) handles zero-variance
        # columns whose std is 0 (division yields NaN).
        df = (df - df.mean()) / df.std()
        df = df.fillna(0)
        # Feature matrix X as a tensor (rows placed by node index).
        X = [[] for _ in range(N)]
        for row in df.itertuples():
            X[row[0]] = row[1:]
        X = torch.tensor(X, dtype=torch.float)
        # edge_index as a 2 x E long tensor.
        edge_index = torch.tensor(edge.to_numpy().T, dtype=torch.long)
        # Assemble the torch_geometric.data.Data object.
        data = Data(x=X, edge_index=edge_index)
    print(f'dataset {dataset} has been downloaded.')
    print(f'is undirected: {data.is_undirected()}')
    print(f'contains self loops: {data.contains_self_loops()}')
    print(f'num_nodes: {data.num_nodes}')
    print(f'num_edges: {data.num_edges}\n')
    # Force the graph to be undirected if it is not already.
    if data.is_undirected() is False:
        data.edge_index = to_undirected(data.edge_index)
        print('The graph has been transformed into undirected one.')
    return data