import torch

from torch_geometric.nn import MetaPath2Vec


def test_metapath2vec():
    edge_index_dict = {
        ('author', 'writes', 'paper'):
        torch.tensor([[0, 1, 1, 2], [0, 0, 1, 1]]),
        ('paper', 'written_by', 'author'):
        torch.tensor([[0, 0, 1, 1], [0, 1, 1, 2]]),
    }

    metapath = [
        ('author', 'writes', 'paper'),
        ('paper', 'written_by', 'author'),
    ]

    model = MetaPath2Vec(edge_index_dict, embedding_dim=16,
                         metapath=metapath, walk_length=2, context_size=2)
    # 3 authors + 2 papers = 5 embedded nodes in total.
    assert model.__repr__() == 'MetaPath2Vec(5, 16)'

    z = model('author')
    assert z.size() == (3, 16)

    z = model('paper')
    assert z.size() == (2, 16)

    z = model('author', torch.arange(2))
    assert z.size() == (2, 16)

    pos_rw, neg_rw = model.sample(torch.arange(3))

    loss = model.loss(pos_rw, neg_rw)
    assert 0 <= loss.item()

    acc = model.test(torch.ones(20, 16), torch.randint(10, (20, )),
                     torch.ones(20, 16), torch.randint(10, (20, )))
    assert 0 <= acc and acc <= 1
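
# A minimal end-to-end sketch (not part of the test above) showing how the
# same API pieces combine during training; the Adam optimizer, batch size,
# and epoch count here are illustrative assumptions.
def train_toy_model(model, epochs=5):
    loader = model.loader(batch_size=4, shuffle=True)  # (pos_rw, neg_rw) batches
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    model.train()
    for _ in range(epochs):
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = model.loss(pos_rw, neg_rw)  # negative-sampling skip-gram loss
            loss.backward()
            optimizer.step()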
import torch

from torch_geometric.nn import MetaPath2Vec

# `pyg_data` (an OGB-MAG style heterogeneous graph) and `args` (a dict of
# hyperparameters, including a device string under 'cuda') are assumed to be
# defined earlier in the script.
print(pyg_data)

metapath = [('author', 'writes', 'paper'),
            ('paper', 'has_topic', 'field_of_study'),
            ('field_of_study', 'rev_has_topic', 'paper'),
            ('paper', 'rev_cites', 'paper'),
            ('paper', 'rev_writes', 'author'),
            ('author', 'affiliated_with', 'institution'),
            ('institution', 'rev_affiliated_with', 'author'),
            ('author', 'writes', 'paper'),
            ('paper', 'cites', 'paper'),
            ('paper', 'rev_writes', 'author')]

metapath2vec_model = MetaPath2Vec(
    pyg_data.edge_index_dict,
    embedding_dim=args['embedding_dim'],
    metapath=metapath,
    walk_length=args['walk_length'],
    context_size=args['context_size'],
    walks_per_node=args['walks_per_node'],
    num_negative_samples=args['num_negative_samples']).to(args['cuda'])

loader = metapath2vec_model.loader(batch_size=128, shuffle=True,
                                   num_workers=4)
optimizer = torch.optim.Adam(metapath2vec_model.parameters(), lr=0.01)

metapath2vec_model.train()
for epoch in range(1, args['epochs'] + 1):
    for i, (pos_rw, neg_rw) in enumerate(loader):
        optimizer.zero_grad()
        loss = metapath2vec_model.loss(pos_rw.to(args['cuda']),
                                       neg_rw.to(args['cuda']))
        loss.backward()   # The original snippet was truncated here;
        optimizer.step()  # backward/step complete the standard update.
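
# After training, per-type embeddings can be read out directly from the
# model; 'author' is one of the node types appearing in the metapath above.
author_z = metapath2vec_model('author')  # shape: [num_authors, embedding_dim]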
import os.path as osp

import torch

from torch_geometric.datasets import AMiner
from torch_geometric.nn import MetaPath2Vec

path = osp.join(osp.dirname(osp.realpath(__file__)), '../../data/AMiner')
dataset = AMiner(path)
data = dataset[0]

metapath = [
    ('author', 'writes', 'paper'),
    ('paper', 'published_in', 'venue'),
    ('venue', 'publishes', 'paper'),
    ('paper', 'written_by', 'author'),
]

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MetaPath2Vec(data.edge_index_dict, embedding_dim=128,
                     metapath=metapath, walk_length=50, context_size=7,
                     walks_per_node=5, num_negative_samples=5,
                     sparse=True).to(device)

loader = model.loader(batch_size=128, shuffle=True, num_workers=6)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)


def train(epoch, log_steps=100, eval_steps=2000):
    model.train()

    total_loss = 0
    for i, (pos_rw, neg_rw) in enumerate(loader):
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()   # The snippet was truncated here; backward/step
        optimizer.step()  # complete the standard optimization step.
        total_loss += loss.item()
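
# Design note: `sparse=True` makes the embedding layer produce sparse
# gradients, which is why `SparseAdam` is used here -- dense optimizers
# such as `Adam` cannot consume sparse gradients.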
import os.path as osp

import torch

from torch_geometric.datasets import AMiner
from torch_geometric.nn import MetaPath2Vec


def main():
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                    'AMiner')
    dataset = AMiner(path)
    data = dataset[0]
    print(data)

    metapath = [
        ('author', 'wrote', 'paper'),
        ('paper', 'published in', 'venue'),
        ('venue', 'published', 'paper'),
        ('paper', 'written by', 'author'),
    ]

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = MetaPath2Vec(data.edge_index_dict, embedding_dim=128,
                         metapath=metapath, walk_length=50, context_size=7,
                         walks_per_node=5, num_negative_samples=5,
                         sparse=True).to(device)

    loader = model.loader(batch_size=128, shuffle=True, num_workers=12)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

    def train(epoch, log_steps=100, eval_steps=2000):
        model.train()

        total_loss = 0
        for i, (pos_rw, neg_rw) in enumerate(loader):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            if (i + 1) % log_steps == 0:
                print(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                      f'Loss: {total_loss / log_steps:.4f}')
                total_loss = 0

            if (i + 1) % eval_steps == 0:
                acc = test()
                print(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                      f'Acc: {acc:.4f}')

    @torch.no_grad()
    def test(train_ratio=0.1):
        model.eval()

        z = model('author', batch=data.y_index_dict['author'])
        y = data.y_dict['author']

        perm = torch.randperm(z.size(0))
        train_perm = perm[:int(z.size(0) * train_ratio)]
        test_perm = perm[int(z.size(0) * train_ratio):]

        return model.test(z[train_perm], y[train_perm], z[test_perm],
                          y[test_perm], max_iter=150)

    for epoch in range(1, 6):
        train(epoch)
        acc = test()
        print(f'Epoch: {epoch}, Accuracy: {acc:.4f}')


if __name__ == '__main__':
    main()
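
# Evaluation note: `model.test` fits a simple logistic-regression classifier
# on the frozen embeddings, here trained on 10% of the labeled authors
# (`train_ratio=0.1`) and evaluated on the rest, so accuracy reflects
# embedding quality rather than end-to-end fine-tuning.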
import argparse

import torch
from ogb.nodeproppred import PygNodePropPredDataset
from torch_sparse import transpose

from torch_geometric.nn import MetaPath2Vec


def save_embedding(model):
    # Minimal checkpoint helper (the original snippet called it without
    # defining it): stores the raw embedding matrix on CPU.
    torch.save(model.embedding.weight.data.cpu(), 'embedding.pt')


def main():
    parser = argparse.ArgumentParser(description='OGBN-MAG (MetaPath2Vec)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--embedding_dim', type=int, default=128)
    parser.add_argument('--walk_length', type=int, default=64)
    parser.add_argument('--context_size', type=int, default=7)
    parser.add_argument('--walks_per_node', type=int, default=5)
    parser.add_argument('--num_negative_samples', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--log_steps', type=int, default=100)
    args = parser.parse_args()

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset('ogbn-mag')
    data = dataset[0]

    # We need to add reverse edges to the heterogeneous graph.
    data.edge_index_dict[('institution', 'employs', 'author')] = transpose(
        data.edge_index_dict[('author', 'affiliated_with', 'institution')],
        None, m=data.num_nodes_dict['author'],
        n=data.num_nodes_dict['institution'])[0]
    data.edge_index_dict[('paper', 'written_by', 'author')] = transpose(
        data.edge_index_dict[('author', 'writes', 'paper')], None,
        m=data.num_nodes_dict['author'], n=data.num_nodes_dict['paper'])[0]
    data.edge_index_dict[('field_of_study', 'contains', 'paper')] = transpose(
        data.edge_index_dict[('paper', 'has_topic', 'field_of_study')], None,
        m=data.num_nodes_dict['paper'],
        n=data.num_nodes_dict['field_of_study'])[0]
    print(data)

    metapath = [
        ('author', 'writes', 'paper'),
        ('paper', 'has_topic', 'field_of_study'),
        ('field_of_study', 'contains', 'paper'),
        ('paper', 'written_by', 'author'),
        ('author', 'affiliated_with', 'institution'),
        ('institution', 'employs', 'author'),
        ('author', 'writes', 'paper'),
        ('paper', 'cites', 'paper'),
        ('paper', 'written_by', 'author'),
    ]

    # Wire the parsed arguments into the model instead of repeating their
    # default values as literals.
    model = MetaPath2Vec(data.edge_index_dict,
                         embedding_dim=args.embedding_dim,
                         metapath=metapath, walk_length=args.walk_length,
                         context_size=args.context_size,
                         walks_per_node=args.walks_per_node,
                         num_negative_samples=args.num_negative_samples,
                         sparse=True).to(device)

    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    model.train()
    for epoch in range(1, args.epochs + 1):
        for i, (pos_rw, neg_rw) in enumerate(loader):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if (i + 1) % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {i + 1:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')

            if (i + 1) % 1000 == 0:  # Save model every 1000 steps.
                save_embedding(model)
        save_embedding(model)


if __name__ == '__main__':
    main()
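
# Design note: the reverse relations added above ('employs', 'written_by',
# 'contains') are needed because random walks can only follow edge types
# present in `edge_index_dict`; the metapath also starts and ends at
# 'author', which lets walks longer than the metapath repeat it cyclically.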
import json
import os.path as osp
import os

import numpy as np
import torch
from scipy import io
from torch_geometric.data import Data
from torch_geometric.nn import MetaPath2Vec
from torch_geometric.utils import from_scipy_sparse_matrix


def metapath2vec(fp, PARAMS):
    """Generate metapath2vec embeddings.

    Args:
        fp (string): The file path of the root of the data.
        PARAMS (dict): The parameters of the metapath2vec model. Keys:
            GRAPH_NAME: the name of the graph file,
            EMBEDDING_DIM: dimension of embedding,
            WALK_LENGTH: random walk length,
            CONTEXT_SIZE: context size,
            WALKS_PER_NODE: number of walks per node,
            NUM_NEG_SAMPLES: number of negative samples,
            LEARNING_RATE: learning rate,
            BATCH_SIZE: batch size of each batch,
            NUM_EPOCH: number of epochs to train,
            CUDA: whether to use the GPU,
            TEST: skip training if true,
            EMBEDDING_NAME: file name used when saving the embedding.

    Returns:
        np.ndarray: The metapath2vec embedding as a numpy array.
    """
    g = io.loadmat(osp.join(fp, 'interim', 'graph', PARAMS['GRAPH_NAME']))
    user_user = from_scipy_sparse_matrix(g['U'])
    author_post = from_scipy_sparse_matrix(g['A'])
    post_user = from_scipy_sparse_matrix(g['P'])

    data = Data(
        edge_index_dict={
            ('user', 'replied by', 'user'): user_user[0],
            ('user', 'wrote', 'post'): author_post[0],
            ('post', 'commented by', 'user'): post_user[0],
        }, num_nodes_dict={
            'post': g['post_indx'].shape[1],
            'user': g['user_indx'].shape[1],
        })

    # The original code referenced `metapath` without defining it; a cyclic
    # user -> post -> user metapath over the edge types above is assumed.
    metapath = [
        ('user', 'wrote', 'post'),
        ('post', 'commented by', 'user'),
    ]

    if PARAMS['CUDA']:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device = 'cpu'

    model = MetaPath2Vec(data.edge_index_dict,
                         embedding_dim=PARAMS['EMBEDDING_DIM'],
                         metapath=metapath,
                         walk_length=PARAMS['WALK_LENGTH'],
                         context_size=PARAMS['CONTEXT_SIZE'],
                         walks_per_node=PARAMS['WALKS_PER_NODE'],
                         num_negative_samples=PARAMS['NUM_NEG_SAMPLES'],
                         sparse=True).to(device)

    losses = []
    if not PARAMS['TEST']:
        loader = model.loader(batch_size=PARAMS['BATCH_SIZE'], shuffle=True,
                              num_workers=8)
        optimizer = torch.optim.SparseAdam(model.parameters(),
                                           lr=PARAMS['LEARNING_RATE'])

        def train(epoch, log_steps=100):
            model.train()
            total_loss = 0
            store = []
            i = 1
            loading = iter(loader)
            while True:
                try:
                    pos_rw, neg_rw = next(loading)
                except IndexError:
                    # Skip occasional malformed batches.
                    continue
                except StopIteration:
                    # The original fell through here and reused the previous
                    # batch; breaking ends the epoch cleanly instead.
                    break
                optimizer.zero_grad()
                loss = model.loss(pos_rw.to(device), neg_rw.to(device))
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                if (i + 1) % log_steps == 0:
                    print(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                          f'Loss: {total_loss / log_steps:.4f}')
                    store.append(total_loss / log_steps)
                    total_loss = 0
                i += 1
            return store

        for epoch in range(1, PARAMS['NUM_EPOCH'] + 1):
            losses.append(train(epoch))

    model.eval()
    with torch.no_grad():
        z = model('post').detach().cpu().numpy()

    os.makedirs(osp.join(fp, 'processed', 'metapath2vec'), exist_ok=True)
    with open(
            osp.join(fp, 'processed', 'metapath2vec',
                     PARAMS['EMBEDDING_NAME'] + 'log.json'), 'w') as f:
        json.dump({'loss': losses}, f)
    np.save(
        osp.join(fp, 'processed', 'metapath2vec', PARAMS['EMBEDDING_NAME']),
        z)
    print('successfully saved embedding')
    return z
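
# A hypothetical invocation, assuming a MATLAB graph file at
# <root>/interim/graph/graph.mat containing the 'U', 'A', 'P', 'post_indx'
# and 'user_indx' fields used above; all values here are illustrative.
if __name__ == '__main__':
    embedding = metapath2vec('data', {
        'GRAPH_NAME': 'graph.mat',
        'EMBEDDING_DIM': 128,
        'WALK_LENGTH': 30,
        'CONTEXT_SIZE': 5,
        'WALKS_PER_NODE': 5,
        'NUM_NEG_SAMPLES': 5,
        'LEARNING_RATE': 0.01,
        'BATCH_SIZE': 128,
        'NUM_EPOCH': 5,
        'CUDA': True,
        'TEST': False,
        'EMBEDDING_NAME': 'post_embedding',
    })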