def __init__(self):
    """Load the ogbl-ppa dataset and keep its node count and official evaluator.

    Fix: the original bound the edge split to an unused local
    (`splitted_edge`); the binding is dropped but the call is kept so any
    one-time download/preprocessing side effect still happens here.
    """
    d_name = "ogbl-ppa"
    dataset = LinkPropPredDataset(name=d_name)
    # Result intentionally discarded: only triggers split materialization.
    dataset.get_edge_split()
    graph = dataset[0]
    # Node count of the single PPA graph, used by downstream samplers.
    self.num_nodes = graph["num_nodes"]
    self.ogb_evaluator = Evaluator(name="ogbl-ppa")
def __init__(self, path: str):
    """Build the OGBL-Citation2 dataset as one general static graph.

    Args:
        path: root directory where the OGB data is (or will be) stored.
    """
    dataset = LinkPropPredDataset("ogbl-citation2", path)
    split = dataset.get_edge_split()
    # The canonical edge list plus the official positive/negative link
    # splits, each converted from numpy to torch tensors.
    edge_tensors = {
        ('', '', ''): torch.from_numpy(dataset[0]['edge_index']),
        ('', 'train_pos_edge', ''): torch.from_numpy(split['train']['edge']),
        ('', 'val_pos_edge', ''): torch.from_numpy(split['valid']['edge']),
        ('', 'val_neg_edge', ''): torch.from_numpy(split['valid']['edge_neg']),
        ('', 'test_pos_edge', ''): torch.from_numpy(split['test']['edge']),
        ('', 'test_neg_edge', ''): torch.from_numpy(split['test']['edge_neg']),
    }
    # Feature-name mapping differs between the DGL and the other backend.
    if _backend.DependentBackend.is_dgl():
        feature_mapping = {'node_feat': 'feat', 'node_year': 'year'}
    else:
        feature_mapping = {'node_feat': 'x', 'node_year': 'year'}
    static_graph = _OGBLDatasetUtil.ogbl_data_to_general_static_graph(
        dataset[0], edge_tensors, feature_mapping)
    super(OGBLCitation2Dataset, self).__init__([static_graph])
def __init__(self,
             graph_wrapper=None,
             buf_size=1000,
             batch_size=128,
             num_workers=1,
             shuffle=True,
             phase="train"):
    """Example generator over ogbl-ppa edges for one data split.

    Fixes: removed a large commented-out graph-construction block and a
    dead `edges = graph["edge_index"].T` assignment (always overwritten),
    and made an unknown `phase` fail fast instead of crashing later with
    a NameError on `labels`.

    Args:
        graph_wrapper: optional graph wrapper stored for downstream use.
        buf_size: shuffle-buffer size forwarded to the base generator.
        batch_size: examples per batch (forwarded to the base generator).
        num_workers: worker count (forwarded to the base generator).
        shuffle: whether the base generator shuffles examples.
        phase: one of "train", "valid" or "test".
    """
    super(PPADataGenerator, self).__init__(buf_size=buf_size,
                                           num_workers=num_workers,
                                           batch_size=batch_size,
                                           shuffle=shuffle)
    self.d_name = "ogbl-ppa"
    self.graph_wrapper = graph_wrapper
    dataset = LinkPropPredDataset(name=self.d_name)
    splitted_edge = dataset.get_edge_split()
    self.phase = phase
    graph = dataset[0]
    self.num_nodes = graph["num_nodes"]

    if self.phase == 'train':
        # Training uses only positive edges; negatives are sampled later.
        edges = splitted_edge["train"]["edge"]
        labels = np.ones(len(edges))
    elif self.phase in ("valid", "test"):
        # Evaluation splits ship explicit positive and negative edges.
        pos_edges = splitted_edge[self.phase]["edge"]
        neg_edges = splitted_edge[self.phase]["edge_neg"]
        pos_labels = np.ones(len(pos_edges))
        neg_labels = np.zeros(len(neg_edges))
        edges = np.vstack([pos_edges, neg_edges])
        labels = pos_labels.tolist() + neg_labels.tolist()
    else:
        raise ValueError(
            "phase must be 'train', 'valid' or 'test', got %r" % phase)

    self.line_examples = []
    Example = namedtuple('Example', ['src', "dst", "label"])
    for edge, label in zip(edges, labels):
        self.line_examples.append(
            Example(src=edge[0], dst=edge[1], label=label))
    print("Phase", self.phase)
    print("Len Examples", len(self.line_examples))
def _load(self) -> None:
    """Fetch the OGB link dataset and build train/test/validation factories."""
    try:
        from ogb.linkproppred import LinkPropPredDataset
    except ImportError as e:
        raise ModuleNotFoundError(
            f'Need to `pip install ogb` to use pykeen.datasets.{self.__class__.__name__}.',
        ) from e
    ogb_dataset = LinkPropPredDataset(name=self.name, root=self.cache_root)
    edge_split = ogb_dataset.get_edge_split()
    # The training split defines the entity/relation vocabularies; the other
    # two splits reuse them so indices stay consistent across factories.
    self._training = self._make_tf(edge_split["train"])
    shared_ids = dict(
        entity_to_id=self._training.entity_to_id,
        relation_to_id=self._training.relation_to_id,
    )
    self._testing = self._make_tf(edge_split["test"], **shared_ids)
    self._validation = self._make_tf(edge_split["valid"], **shared_ids)
def __init__(self, path: str):
    """Build the OGBL-DDI dataset as one heterogeneous static graph.

    Args:
        path: root directory where the OGB data is (or will be) stored.
    """
    dataset = LinkPropPredDataset("ogbl-ddi", path)
    split = dataset.get_edge_split()
    # DDI ships no node features, so nodes carry only their integer IDs.
    node_groups = {'': {
        '_NID': torch.arange(dataset[0]['num_nodes'])
    }}
    # Canonical edge list plus the official positive/negative link splits.
    edge_tensors = {
        ('', '', ''): torch.from_numpy(dataset[0]['edge_index']),
        ('', 'train_pos_edge', ''): torch.from_numpy(split['train']['edge']),
        ('', 'val_pos_edge', ''): torch.from_numpy(split['valid']['edge']),
        ('', 'val_neg_edge', ''): torch.from_numpy(split['valid']['edge_neg']),
        ('', 'test_pos_edge', ''): torch.from_numpy(split['test']['edge']),
        ('', 'test_neg_edge', ''): torch.from_numpy(split['test']['edge_neg']),
    }
    static_graph = GeneralStaticGraphGenerator.create_heterogeneous_static_graph(
        node_groups, edge_tensors)
    super(OGBLDDIDataset, self).__init__([static_graph])
def main(_):
    """Run randomized-SVD (WYS/DeepWalk matrix) link prediction FLAGS.num_runs times.

    Fixes: the final summary `print` had an unfilled `%i` placeholder (it
    printed the literal text), and the last two report lines hard-coded
    `HITS@20` even though the metric is `hits@FLAGS.hits`.
    """
    ds = LinkPropPredDataset(FLAGS.dataset)
    split_edge = ds.get_edge_split()
    train_edges = split_edge['train']['edge']
    # Symmetrize: include every training edge in both directions.
    train_edges = np.concatenate([train_edges, train_edges[:, ::-1]], axis=0)
    spa = scipy.sparse.csr_matrix(
        (np.ones([len(train_edges)]), (train_edges[:, 0], train_edges[:, 1])))
    mult_f = tf_fsvd.WYSDeepWalkPF(spa, window=FLAGS.wys_window,
                                   mult_degrees=False,
                                   neg_sample_coef=FLAGS.wys_neg_coef)
    tt = tqdm.tqdm(range(FLAGS.num_runs))
    test_metrics = []
    val_metrics = []
    for run in tt:
        u, s, v = tf_fsvd.fsvd(mult_f, FLAGS.k, n_iter=FLAGS.svd_iters,
                               n_redundancy=FLAGS.k * 3)
        dataset = LinkPropPredDataset(FLAGS.dataset)
        evaluator = Evaluator(name=FLAGS.dataset)
        evaluator.K = FLAGS.hits
        split_edge = dataset.get_edge_split()
        metrics = []
        for split in ('test', 'valid'):
            pos_edges = split_edge[split]['edge']
            neg_edges = split_edge[split]['edge_neg']
            # Score edge (i, j) as the inner product <(u*s)[i], v[j]>.
            pos_scores = tf.reduce_sum(
                tf.gather(u * s, pos_edges[:, 0]) *
                tf.gather(v, pos_edges[:, 1]), axis=1).numpy()
            neg_scores = tf.reduce_sum(
                tf.gather(u * s, neg_edges[:, 0]) *
                tf.gather(v, neg_edges[:, 1]), axis=1).numpy()
            metric = evaluator.eval({
                'y_pred_pos': pos_scores,
                'y_pred_neg': neg_scores
            })
            metrics.append(metric['hits@%i' % FLAGS.hits])
        test_metrics.append(metrics[0])
        val_metrics.append(metrics[1])
        tt.set_description(
            'HITS@%i: validate=%g; test=%g' %
            (FLAGS.hits, np.mean(val_metrics), np.mean(test_metrics)))
    # Bug fix: supply the run count the original format string left unfilled.
    print('\n\n *** Trained for %i times and average metrics are:' % FLAGS.num_runs)
    # Report the configured hits@K rather than a hard-coded 20.
    print('HITS@%i test: mean=%g; std=%g' %
          (FLAGS.hits, np.mean(test_metrics), np.std(test_metrics)))
    print('HITS@%i validate: mean=%g; std=%g' %
          (FLAGS.hits, np.mean(val_metrics), np.std(val_metrics)))
def _add_eig(self, norm='none', number=6):
    """Attach Laplacian eigenvector features to ``self.graph.ndata['eig']``.

    Builds the ogbl-collab graph with networkx, computes per-connected-component
    Laplacian eigenvectors (smallest-real eigenvalues), and writes them into a
    global (num_nodes, number) matrix.

    Args:
        norm: Laplacian variant — 'none' (L = D - A), 'sym' (symmetric
            normalization) or 'walk' (random-walk normalization).
            NOTE(review): any other value leaves `L` undefined and would raise
            a NameError below — confirm callers only pass these three.
        number: number of eigenvector columns to keep per node.
    """
    dataset = LinkPropPredDataset(name='ogbl-collab')
    graph = dataset[0]
    G = nx.Graph()
    # 235868 is the ogbl-collab node count, hard-coded here and for
    # EigVec_global below — TODO: derive from graph['num_nodes'].
    G.add_nodes_from([i for i in range(235868)])
    for nod1, nod2 in zip(graph['edge_index'][0], graph['edge_index'][1]):
        G.add_edge(nod1, nod2)
    # Work per connected component so each eigendecomposition stays small.
    components = list(nx.connected_components(G))
    list_G = []
    list_nodes = []
    for component in components:
        G_new = nx.Graph()
        G_new.add_nodes_from(list(component))
        list_G.append(G_new)
        list_nodes.append(list(component))
    # Copy the component-internal edges into each subgraph.
    for i in range(len(list_G)):
        for nod1, nod2 in list(G.edges(list_nodes[i])):
            list_G[i].add_edge(nod1, nod2)
    # Default feature is all-ones; overwritten per component below.
    EigVec_global = np.ones((235868, number))
    for g in list_G:
        node_list = list(g.nodes)
        A = nx.adjacency_matrix(g, nodelist=node_list).astype(float)
        if norm == 'none':
            # Unnormalized Laplacian L = D - A.
            D = sp.diags(list(map(lambda x: x[1], g.degree())))
            L = D - A
        elif norm == 'sym':
            # Symmetric normalization D^{-1/2} (D - A) D^{-1/2}.
            D_norm = sp.diags(list(map(lambda x: x[1]**(-0.5), g.degree())))
            D = sp.diags(list(map(lambda x: x[1], g.degree())))
            L = D_norm * (D - A) * D_norm
        elif norm == 'walk':
            # Random-walk normalization D^{-1} (D - A).
            D_norm = sp.diags(list(map(lambda x: x[1]**(-1), g.degree())))
            D = sp.diags(list(map(lambda x: x[1], g.degree())))
            L = D_norm * (D - A)
        if len(node_list) > 2:
            # Smallest-real ('SR') eigenpairs; k is capped at n-2 because the
            # sparse eigensolver requires k < n.
            EigVal, EigVec = sp.linalg.eigs(L,
                                            k=min(len(node_list) - 2, number),
                                            which='SR',
                                            tol=0)
            # Sort columns by eigenvalue and normalize each by its max entry.
            EigVec = EigVec[:, EigVal.argsort()] / np.max(
                EigVec[:, EigVal.argsort()], 0)
            EigVec_global[node_list, :min(len(node_list) - 2,
                                          number)] = EigVec[:, :]
        elif len(node_list) == 2:
            # Two-node components: zero out the first node's features.
            # NOTE(review): the second node keeps the all-ones default —
            # presumably intentional, but worth confirming.
            EigVec_global[node_list[0], :number] = np.zeros((1, number))
    self.graph.ndata['eig'] = torch.from_numpy(EigVec_global).float()
    print(sorted(self.graph.ndata['eig'][1]))
def main(args):
    """Train/evaluate a KGE model on ogbl-biokg.

    Heterogeneous variant: entities of each type occupy a contiguous index
    range recorded in `entity_dict`, and `train_count` keys include the
    entity type. At least one of the do_train/do_valid/do_test/
    evaluate_train flags must be set.
    """
    if (not args.do_train) and (not args.do_valid) and (not args.do_test) and (
            not args.evaluate_train):
        raise ValueError('one of train/val/test mode must be choosed.')
    if args.init_checkpoint:
        override_config(args)
    args.save_path = 'log/%s/%s/%s-%s/%s' % (
        args.dataset, args.model, args.hidden_dim, args.gamma,
        time.time()) if args.save_path == None else args.save_path
    writer = SummaryWriter(args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)

    # NOTE(review): dataset name is hard-coded while the evaluator uses
    # args.dataset — confirm args.dataset is always 'ogbl-biokg'.
    dataset = LinkPropPredDataset(name='ogbl-biokg')
    split_edge = dataset.get_edge_split()
    train_triples, valid_triples, test_triples = split_edge[
        "train"], split_edge["valid"], split_edge["test"]
    nrelation = int(max(train_triples['relation'])) + 1

    # Map each entity type to its [start, end) slice of the global index
    # space; nentity is the total over all types.
    entity_dict = dict()
    cur_idx = 0
    for key in dataset[0]['num_nodes_dict']:
        entity_dict[key] = (cur_idx,
                            cur_idx + dataset[0]['num_nodes_dict'][key])
        cur_idx += dataset[0]['num_nodes_dict'][key]
    nentity = sum(dataset[0]['num_nodes_dict'].values())

    evaluator = Evaluator(name=args.dataset)

    args.nentity = nentity
    args.nrelation = nrelation

    logging.info('Model: %s' % args.model)
    logging.info('Dataset: %s' % args.dataset)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)

    # train_triples = split_dict['train']
    logging.info('#train: %d' % len(train_triples['head']))
    # valid_triples = split_dict['valid']
    logging.info('#valid: %d' % len(valid_triples['head']))
    # test_triples = split_dict['test']
    logging.info('#test: %d' % len(test_triples['head']))

    # Frequency counts (smoothed with a base of 4) and true-neighbor maps
    # used for negative-sample filtering during training.
    train_count, train_true_head, train_true_tail = defaultdict(
        lambda: 4), defaultdict(list), defaultdict(list)
    for i in tqdm(range(len(train_triples['head']))):
        head, relation, tail = train_triples['head'][i], train_triples[
            'relation'][i], train_triples['tail'][i]
        head_type, tail_type = train_triples['head_type'][i], train_triples[
            'tail_type'][i]
        train_count[(head, relation, head_type)] += 1
        # -relation - 1 encodes the inverse direction distinctly.
        train_count[(tail, -relation - 1, tail_type)] += 1
        train_true_head[(relation, tail)].append(head)
        train_true_tail[(head, relation)].append(tail)

    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding,
        evaluator=evaluator)

    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, require_grad = %s' %
                     (name, str(param.size()), str(param.requires_grad)))

    if args.cuda:
        kge_model = kge_model.cuda()

    if args.init_checkpoint:
        # Restore model from checkpoint directory. Only entity_dict is
        # needed before the dataloaders are built; the model/optimizer
        # state is restored further below from the same `checkpoint`.
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(
            os.path.join(args.init_checkpoint, 'checkpoint'))
        entity_dict = checkpoint['entity_dict']

    if args.do_train:
        # Set training dataloader iterator: alternate head- and tail-batch
        # negative sampling via the bidirectional iterator.
        train_dataloader_head = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'head-batch', train_count,
                         train_true_head, train_true_tail, entity_dict),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)
        train_dataloader_tail = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'tail-batch', train_count,
                         train_true_head, train_true_tail, entity_dict),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)
        train_iterator = BidirectionalOneShotIterator(train_dataloader_head,
                                                      train_dataloader_tail)

        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                            kge_model.parameters()),
                                     lr=current_learning_rate)
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model/optimizer state from the checkpoint loaded above.
        # logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        # checkpoint = torch.load(os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        # entity_dict = checkpoint['entity_dict']
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Ramdomly Initializing %s Model...' % args.model)
        init_step = 0

    step = init_step

    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('negative_adversarial_sampling = %d' %
                 args.negative_adversarial_sampling)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' %
                 str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' %
                     args.adversarial_temperature)

    # Set valid dataloader as it would be evaluated during training

    if args.do_train:
        logging.info('learning_rate = %d' % current_learning_rate)

        training_logs = []

        # Training Loop
        for step in range(init_step, args.max_steps):

            log = kge_model.train_step(kge_model, optimizer, train_iterator,
                                       args)
            training_logs.append(log)

            # Learning-rate decay: divide by 10 once warm-up is passed,
            # then push the next decay point 3x further out.
            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info('Change learning_rate to %f at step %d' %
                             (current_learning_rate, step))
                optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                                    kge_model.parameters()),
                                             lr=current_learning_rate)
                warm_up_steps = warm_up_steps * 3

            if step % args.save_checkpoint_steps == 0 and step > 0:  # ~ 41 seconds/saving
                save_variable_list = {
                    'step': step,
                    'current_learning_rate': current_learning_rate,
                    'warm_up_steps': warm_up_steps,
                    'entity_dict': entity_dict
                }
                save_model(kge_model, optimizer, save_variable_list, args)

            if step % args.log_steps == 0:
                # Average each logged metric over the window since the
                # last report, then reset the window.
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum(
                        [log[metric]
                         for log in training_logs]) / len(training_logs)
                log_metrics('Train', step, metrics, writer)
                training_logs = []

            if args.do_valid and step % args.valid_steps == 0 and step > 0:
                logging.info('Evaluating on Valid Dataset...')
                metrics = kge_model.test_step(kge_model, valid_triples, args,
                                              entity_dict)
                log_metrics('Valid', step, metrics, writer)

        # Final checkpoint after the training loop completes.
        save_variable_list = {
            'step': step,
            'current_learning_rate': current_learning_rate,
            'warm_up_steps': warm_up_steps
        }
        save_model(kge_model, optimizer, save_variable_list, args)

    if args.do_valid:
        logging.info('Evaluating on Valid Dataset...')
        metrics = kge_model.test_step(kge_model, valid_triples, args,
                                      entity_dict)
        log_metrics('Valid', step, metrics, writer)

    if args.do_test:
        logging.info('Evaluating on Test Dataset...')
        metrics = kge_model.test_step(kge_model, test_triples, args,
                                      entity_dict)
        log_metrics('Test', step, metrics, writer)

    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        # Evaluate on a random subsample of the training triples.
        small_train_triples = {}
        indices = np.random.choice(len(train_triples['head']),
                                   args.ntriples_eval_train,
                                   replace=False)
        for i in train_triples:
            if 'type' in i:
                # head_type/tail_type are plain lists, so index them one
                # element at a time rather than with a fancy index.
                small_train_triples[i] = [train_triples[i][x] for x in indices]
            else:
                small_train_triples[i] = train_triples[i][indices]
        metrics = kge_model.test_step(kge_model,
                                      small_train_triples,
                                      args,
                                      entity_dict,
                                      random_sampling=True)
        log_metrics('Train', step, metrics, writer)
# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================= """ogbl_collab dataset. """ import os import numpy as np from ogb.linkproppred import LinkPropPredDataset # load data dataset = LinkPropPredDataset(name='ogbl-collab') split_edge = dataset.get_edge_split() train_edge, valid_edge, test_edge = split_edge['train'], split_edge[ 'valid'], split_edge['test'] # train_edge['edge'], (1179052, 2) # train_edge['weight'], (1179052,) # train_edge['year'], (1179052,) # valid_edge, 60084 # test_edge, 46329 graph = dataset[0] num_nodes = graph['num_nodes'] # 235868 node_feat = graph['node_feat'] # shape(235868, 128) # dump to disk root = 'ogbl_collab/'
from collections import defaultdict import time import pdb import datetime def now(): d = datetime.datetime.now() x = d - datetime.timedelta(microseconds=d.microsecond) return x d_name = "ogbl-biokg" dataset = LinkPropPredDataset(name=d_name) split_edge = dataset.get_edge_split() train_triples, valid_triples, test_triples = split_edge["train"], split_edge[ "valid"], split_edge["test"] nrelation = int(max(train_triples['relation'])) + 1 #4 nentity = sum(dataset[0]['num_nodes_dict'].values()) entity_dict = dict() cur_idx = 0 for key in dataset[0][ 'num_nodes_dict']: #['drug', 'sideeffect', 'protein', 'disease', 'function']: entity_dict[key] = (cur_idx, cur_idx + dataset[0]['num_nodes_dict'][key]) cur_idx += dataset[0]['num_nodes_dict'][key] nentity = sum(
def main(args):
    """Train/evaluate a KGE model on a homogeneous OGB link dataset.

    Unlike the biokg variant, entity/relation counts come straight from
    `dataset.graph` and no per-type entity dictionary is needed. At least
    one of do_train/do_valid/do_test/evaluate_train must be set.
    """
    if (
        (not args.do_train)
        and (not args.do_valid)
        and (not args.do_test)
        and (not args.evaluate_train)
    ):
        raise ValueError("one of train/val/test mode must be choosed.")
    if args.init_checkpoint:
        override_config(args)

    args.save_path = (
        "log/%s/%s/%s-%s/%s"
        % (args.dataset, args.model, args.hidden_dim, args.gamma, time.time())
        if args.save_path == None
        else args.save_path
    )
    writer = SummaryWriter(args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)

    dataset = LinkPropPredDataset(name=args.dataset)
    split_dict = dataset.get_edge_split()
    nentity = dataset.graph["num_nodes"]
    nrelation = int(max(dataset.graph["edge_reltype"])[0]) + 1

    evaluator = Evaluator(name=args.dataset)

    args.nentity = nentity
    args.nrelation = nrelation

    logging.info("Model: %s" % args.model)
    logging.info("Dataset: %s" % args.dataset)
    logging.info("#entity: %d" % nentity)
    logging.info("#relation: %d" % nrelation)

    train_triples = split_dict["train"]
    logging.info("#train: %d" % len(train_triples["head"]))
    valid_triples = split_dict["valid"]
    logging.info("#valid: %d" % len(valid_triples["head"]))
    test_triples = split_dict["test"]
    logging.info("#test: %d" % len(test_triples["head"]))

    # Frequency counts (smoothed with a base of 4) and true-neighbor maps
    # used for negative-sample filtering during training.
    train_count, train_true_head, train_true_tail = (
        defaultdict(lambda: 4),
        defaultdict(list),
        defaultdict(list),
    )
    for i in tqdm(range(len(train_triples["head"]))):
        head, relation, tail = (
            train_triples["head"][i],
            train_triples["relation"][i],
            train_triples["tail"][i],
        )
        train_count[(head, relation)] += 1
        # -relation - 1 encodes the inverse direction distinctly.
        train_count[(tail, -relation - 1)] += 1
        train_true_head[(relation, tail)].append(head)
        train_true_tail[(head, relation)].append(tail)

    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding,
        evaluator=evaluator,
    )

    logging.info("Model Parameter Configuration:")
    for name, param in kge_model.named_parameters():
        logging.info(
            "Parameter %s: %s, require_grad = %s"
            % (name, str(param.size()), str(param.requires_grad))
        )

    if args.cuda:
        kge_model = kge_model.cuda()

    if args.do_train:
        # Set training dataloader iterator: alternate head- and tail-batch
        # negative sampling via the bidirectional iterator.
        train_dataloader_head = DataLoader(
            TrainDataset(
                train_triples,
                nentity,
                nrelation,
                args.negative_sample_size,
                "head-batch",
                train_count,
                train_true_head,
                train_true_tail,
            ),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn,
        )

        train_dataloader_tail = DataLoader(
            TrainDataset(
                train_triples,
                nentity,
                nrelation,
                args.negative_sample_size,
                "tail-batch",
                train_count,
                train_true_head,
                train_true_tail,
            ),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn,
        )

        train_iterator = BidirectionalOneShotIterator(
            train_dataloader_head, train_dataloader_tail
        )

        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, kge_model.parameters()),
            lr=current_learning_rate,
        )
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info("Loading checkpoint %s..." % args.init_checkpoint)
        checkpoint = torch.load(os.path.join(args.init_checkpoint, "checkpoint"))
        init_step = checkpoint["step"]
        kge_model.load_state_dict(checkpoint["model_state_dict"])
        if args.do_train:
            current_learning_rate = checkpoint["current_learning_rate"]
            warm_up_steps = checkpoint["warm_up_steps"]
            optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    else:
        logging.info("Ramdomly Initializing %s Model..." % args.model)
        init_step = 0

    step = init_step

    logging.info("Start Training...")
    logging.info("init_step = %d" % init_step)
    logging.info("batch_size = %d" % args.batch_size)
    logging.info(
        "negative_adversarial_sampling = %d" % args.negative_adversarial_sampling
    )
    logging.info("hidden_dim = %d" % args.hidden_dim)
    logging.info("gamma = %f" % args.gamma)
    logging.info(
        "negative_adversarial_sampling = %s"
        % str(args.negative_adversarial_sampling)
    )
    if args.negative_adversarial_sampling:
        logging.info("adversarial_temperature = %f" % args.adversarial_temperature)

    # Set valid dataloader as it would be evaluated during training

    if args.do_train:
        logging.info("learning_rate = %d" % current_learning_rate)

        training_logs = []

        # Training Loop
        for step in range(init_step, args.max_steps):

            log = kge_model.train_step(kge_model, optimizer, train_iterator, args)
            training_logs.append(log)

            # Learning-rate decay: divide by 10 once warm-up is passed,
            # then push the next decay point 3x further out.
            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info(
                    "Change learning_rate to %f at step %d"
                    % (current_learning_rate, step)
                )
                optimizer = torch.optim.Adam(
                    filter(lambda p: p.requires_grad, kge_model.parameters()),
                    lr=current_learning_rate,
                )
                warm_up_steps = warm_up_steps * 3

            if (
                step % args.save_checkpoint_steps == 0 and step > 0
            ):  # ~ 41 seconds/saving
                save_variable_list = {
                    "step": step,
                    "current_learning_rate": current_learning_rate,
                    "warm_up_steps": warm_up_steps,
                }
                save_model(kge_model, optimizer, save_variable_list, args)

            if step % args.log_steps == 0:
                # Average each logged metric over the window since the
                # last report, then reset the window.
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum([log[metric] for log in training_logs]) / len(
                        training_logs
                    )
                log_metrics("Train", step, metrics, writer)
                training_logs = []

            if args.do_valid and step % args.valid_steps == 0 and step > 0:
                logging.info("Evaluating on Valid Dataset...")
                metrics = kge_model.test_step(kge_model, valid_triples, args)
                log_metrics("Valid", step, metrics, writer)

        # Final checkpoint after the training loop completes.
        save_variable_list = {
            "step": step,
            "current_learning_rate": current_learning_rate,
            "warm_up_steps": warm_up_steps,
        }
        save_model(kge_model, optimizer, save_variable_list, args)

    if args.do_valid:
        logging.info("Evaluating on Valid Dataset...")
        metrics = kge_model.test_step(kge_model, valid_triples, args)
        log_metrics("Valid", step, metrics, writer)

    if args.do_test:
        logging.info("Evaluating on Test Dataset...")
        metrics = kge_model.test_step(kge_model, test_triples, args)
        log_metrics("Test", step, metrics, writer)

    if args.evaluate_train:
        logging.info("Evaluating on Training Dataset...")
        # Evaluate on a random subsample of the training triples.
        small_train_triples = {}
        indices = np.random.choice(
            len(train_triples["head"]), args.ntriples_eval_train, replace=False
        )
        for i in train_triples:
            small_train_triples[i] = train_triples[i][indices]
        metrics = kge_model.test_step(
            kge_model, small_train_triples, args, random_sampling=True
        )
        log_metrics("Train", step, metrics, writer)
def main():
    """Train and evaluate MADGraph on ogbl-ddi over several runs.

    Parses CLI hyperparameters, loads the dataset, then for each run
    reinitializes the model, trains with sampled positive/negative edge
    batches, and logs Hits@{10,20,30} every `eval_steps` epochs.
    """
    parser = argparse.ArgumentParser(description='OGBL-DDI (MADGraph)')
    parser.add_argument('--lr', type=float, default=0.005)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=4 * 1024)
    parser.add_argument('--dim', type=int, default=12)
    parser.add_argument('--heads', type=int, default=12)
    parser.add_argument('--samples', type=int, default=8)
    parser.add_argument('--nearest', type=int, default=8)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--sentinels', type=int, default=8)
    parser.add_argument('--memory', type=str, default='all')
    # NOTE(review): type=bool on argparse does not parse 'False' as False;
    # any non-empty string is truthy — confirm intended usage.
    parser.add_argument('--softmin', type=bool, default=True)
    parser.add_argument('--output_csv', type=str, default='')
    args = parser.parse_args()
    print(args)
    DNAME = 'ogbl-ddi'
    dataset = LinkPropPredDataset(name=DNAME)
    graph = dataset[0]
    n_nodes = graph['num_nodes']
    data = dataset.get_edge_split()
    # Move every split's edge tensors onto the device via gpu().
    for group in 'train valid test'.split():
        if group in data:
            sets = data[group]
            for key in ('edge', 'edge_neg'):
                if key in sets:
                    sets[key] = gpu(torch.from_numpy(sets[key]))
    # Random subset of training edges, sized like the validation split,
    # used as the "train" positives during evaluation.
    data['eval_train'] = {
        'edge': data['train']['edge'][torch.randperm(
            data['train']['edge'].shape[0])[:data['valid']['edge'].shape[0]]]
    }
    model = MADGraph(
        n_nodes=n_nodes,
        node_feats=args.dim,
        src=data['train']['edge'][:, 0],
        dst=data['train']['edge'][:, 1],
        n_samples=args.samples,
        n_heads=args.heads,
        n_sentinels=args.sentinels,
        memory=['none', 'stat', 'all'].index(args.memory),
        softmin=args.softmin,
        n_nearest=args.nearest,
    )
    params = [p for net in [model] for p in net.parameters()]
    print('params:', sum(p.numel() for p in params))
    evaluator = Evaluator(name=DNAME)
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@20': Logger(args.runs, args),
        'Hits@30': Logger(args.runs, args),
    }
    for run in range(args.runs):
        # Reseed and reinitialize the learned tensors for each run.
        torch.manual_seed(args.seed + run)
        opt = optim.Adam(params, lr=args.lr)
        torch.nn.init.xavier_uniform_(model.pos.data)
        torch.nn.init.xavier_uniform_(model.field.data)
        model.uncertainty.data = model.uncertainty.data * 0 + 1
        for epoch in range(1, args.epochs + 1):
            model.train()
            for chunk in sample(data['train']['edge'], args.batch_size):
                opt.zero_grad()
                p_edge = torch.sigmoid(model(chunk))
                # Uniformly random node pairs serve as negatives.
                edge_neg_chunk = gpu(torch.randint(0, n_nodes, chunk.shape))
                p_edge_neg = torch.sigmoid(model(edge_neg_chunk))
                # Binary cross-entropy on positives and negatives, with a
                # 1e-5 floor inside the logs for numerical stability.
                loss = (-torch.log(1e-5 + 1 - p_edge_neg).mean() -
                        torch.log(1e-5 + p_edge).mean())
                loss.backward()
                opt.step()
            # Only evaluate every eval_steps epochs.
            if epoch % args.eval_steps:
                continue
            with torch.no_grad():
                model.eval()
                p_train = torch.cat([
                    model(chunk) for chunk in sample(
                        data['eval_train']['edge'], args.batch_size)
                ])
                n_train = torch.cat([
                    model(chunk) for chunk in sample(data['valid']['edge_neg'],
                                                     args.batch_size)
                ])
                p_valid = torch.cat([
                    model(chunk) for chunk in sample(data['valid']['edge'],
                                                     args.batch_size)
                ])
                # Validation negatives are reused for the train metric too.
                n_valid = n_train
                p_test = torch.cat([
                    model(chunk) for chunk in sample(data['test']['edge'],
                                                     args.batch_size)
                ])
                n_test = torch.cat([
                    model(chunk) for chunk in sample(data['test']['edge_neg'],
                                                     args.batch_size)
                ])
                for K in [10, 20, 30]:
                    evaluator.K = K
                    key = f'Hits@{K}'
                    h_train = evaluator.eval({
                        'y_pred_pos': p_train,
                        'y_pred_neg': n_train,
                    })[f'hits@{K}']
                    h_valid = evaluator.eval({
                        'y_pred_pos': p_valid,
                        'y_pred_neg': n_valid,
                    })[f'hits@{K}']
                    h_test = evaluator.eval({
                        'y_pred_pos': p_test,
                        'y_pred_neg': n_test,
                    })[f'hits@{K}']
                    loggers[key].add_result(run, (h_train, h_valid, h_test))
                    print(key)
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {100 * h_train:.2f}%, '
                          f'Valid: {100 * h_valid:.2f}%, '
                          f'Test: {100 * h_test:.2f}%')
                    print('---')
        # Per-run summary.
        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)
    # Summary over all runs.
    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
def test_datasetsaver():
    """End-to-end smoke test for DatasetSaver.

    Loads an OGB dataset for the category selected by `test_task`, re-saves
    it through DatasetSaver, then reloads it from the saved meta_dict with
    every available backend (agnostic / PyG / DGL), and finally zips and
    cleans up. Edit `test_task` to exercise a different category.
    """
    # test on graph classification
    # ogbg-molhiv
    test_task = 'link'

    # testing all the dataset objects are working.
    if test_task == 'graph':
        from ogb.graphproppred import PygGraphPropPredDataset, DglGraphPropPredDataset, GraphPropPredDataset
        dataset_name = 'ogbg-molhiv'
        dataset = PygGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = GraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'node':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-arxiv'  # test ogbn-proteins
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'link':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-collab'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    elif test_task == 'heteronode':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-mag'
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'heterolink':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-biokg'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    else:
        raise ValueError('Invalid task category')

    # `dataset` is now the library-agnostic loader from the branch above.
    print(dataset[0])
    if 'link' in test_task:
        print(dataset.get_edge_split())
    else:
        print(dataset.get_idx_split())

    # Graph tasks hold a list of graphs; other tasks hold a single graph.
    if 'graph' in test_task:
        graph_list = dataset.graphs
    else:
        graph_list = [dataset.graph]

    # Link tasks have no target labels to save.
    if 'link' not in test_task:
        labels = dataset.labels

    is_hetero = 'hetero' in test_task
    version = 2 if dataset_name == 'ogbn-mag' else 1
    saver = DatasetSaver(dataset_name, is_hetero, version=version)

    # saving graph objects
    saver.save_graph_list(graph_list)
    # saving target labels
    if 'link' not in test_task:
        saver.save_target_labels(labels)
    # saving split
    if 'link' in test_task:
        split_idx = dataset.get_edge_split()
    else:
        split_idx = dataset.get_idx_split()
    # second argument must be the name of the split
    saver.save_split(split_idx, dataset.meta_info['split'])
    # copying mapping dir
    # saver.copy_mapping_dir(f"dataset/{'_'.join(dataset_name.split('-'))}/mapping/")
    saver.copy_mapping_dir("dataset/{}/mapping/".format('_'.join(
        dataset_name.split('-'))))

    saver.save_task_info(
        dataset.task_type, dataset.eval_metric,
        dataset.num_classes if hasattr(dataset, 'num_classes') else None)

    meta_dict = saver.get_meta_dict()
    print(meta_dict)

    print('Now testing.')

    # Reload from the saved meta_dict twice per backend to exercise the
    # already-processed code path as well as the fresh-load path.
    if 'graph' in test_task:
        print('library agnostic')
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'node' in test_task:
        print('library agnostic')
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'link' in test_task:
        print('library agnostic')
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('Pytorch Geometric')
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('DGL')
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
    else:
        raise ValueError('Invalid task category')

    # zip
    saver.zip()
    print('Finished zipping!')

    saver.cleanup()
def main():
    """Run the full coarsen-and-embed pipeline from the command line.

    Parses CLI options, loads a graph (either the OGB ``ogbl-ppa`` dataset
    or a local ``.mtx``/json dataset), computes node embeddings with the
    selected method (deepwalk / node2vec / graphsage), saves the embeddings
    to ``args.embed_path``, and prints CPU timing information.

    NOTE(review): several names used below (``mtx2graph``, ``json2mtx``,
    ``mtx2matrix``, ``normalize``, ``identity``, ``projections``,
    ``deepwalk``, ``node2vec``, ``laplacian_matrix``, ``nx``, ``np``,
    ``time``) must be provided by module-level imports / definitions not
    visible in this chunk — confirm against the full file.
    """
    parser = ArgumentParser(description="ne")
    parser.add_argument("-d", "--dataset", type=str, default="cora",
                        help="input dataset")
    parser.add_argument("-o", "--coarse", type=str, default="simple",
                        help="choose either simple_coarse or lamg_coarse, [simple, lamg]")
    parser.add_argument("-c", "--mcr_dir", type=str, default="/opt/matlab/R2018A/",
                        help="directory of matlab compiler runtime (only required by lamg_coarsen)")
    parser.add_argument("-s", "--search_ratio", type=int, default=12,
                        help="control the search space in graph fusion process (only required by lamg_coarsen)")
    parser.add_argument("-r", "--reduce_ratio", type=int, default=2,
                        help="control graph coarsening levels (only required by lamg_coarsen)")
    parser.add_argument("-v", "--level", type=int, default=1,
                        help="number of coarsening levels (only required by simple_coarsen)")
    parser.add_argument("-n", "--num_neighs", type=int, default=2,
                        help="control k-nearest neighbors in graph fusion process")
    parser.add_argument("-l", "--lda", type=float, default=0.1,
                        help="control self loop in adjacency matrix")
    parser.add_argument("-e", "--embed_path", type=str,
                        default="embed_results/embeddings_palone_deepwalk.npy",
                        help="path of embedding result")
    parser.add_argument("-m", "--embed_method", type=str, default="deepwalk",
                        help="[deepwalk, node2vec, graphsage]")
    # NOTE(review): store_false with default=True means passing -f DISABLES
    # fusion; the flag name reads like an enable switch — confirm intent.
    parser.add_argument("-f", "--fusion", default=True, action="store_false",
                        help="whether use graph fusion")
    parser.add_argument("-p", "--power", default=False, action="store_true",
                        help="Strong power of graph filter, set True to enhance filter power")
    parser.add_argument("-g", "--sage_model", type=str, default="mean",
                        help="aggregation function in graphsage")
    # Same store_false/default=True inversion as --fusion above.
    parser.add_argument("-w", "--sage_weighted", default=True, action="store_false",
                        help="whether consider weighted reduced graph")
    args = parser.parse_args()

    # Derived file-system paths; `dataset` is rebound to an OGB dataset
    # object below when --dataset ogb is given.
    dataset = args.dataset
    feature_path = "dataset/{}/{}-feats.npy".format(dataset, dataset)
    # NOTE(review): fusion_input_path / coarsen_input_path are computed but
    # never used in this function — presumably consumed by removed code.
    fusion_input_path = "dataset/{}/{}.mtx".format(dataset, dataset)
    reduce_results = "reduction_results/"
    mapping_path = "{}Mapping.mtx".format(reduce_results)
    if args.fusion:
        coarsen_input_path = "dataset/{}/fused_{}.mtx".format(dataset, dataset)
    else:
        coarsen_input_path = "dataset/{}/{}.mtx".format(dataset, dataset)

    ###### Load Data ######
    print("%%%%%% Loading Graph Data %%%%%%")
    if args.dataset == "ogb":
        # Build a NetworkX graph from the OGB ogbl-ppa link-prediction dataset.
        d_name = "ogbl-ppa"
        from ogb.linkproppred import LinkPropPredDataset
        dataset = LinkPropPredDataset(name=d_name)
        print(dataset)
        print(dataset[0])
        split_edge = dataset.get_edge_split()
        print(split_edge)
        # train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
        graph = dataset[0]  # graph: library-agnostic graph object
        print(graph['edge_index'].shape)
        print(graph['edge_feat'])
        print(graph['node_feat'])
        # print((np.array(graph['node_feat']) == 0.0).all())
        graph['directed'] = False
        print(graph)
        graph_nodes = [i for i in range(0, graph['num_nodes'])]
        G = nx.Graph()
        G.add_nodes_from(graph_nodes)
        # edge_index is (2, num_edges); transpose to an iterable of (u, v) pairs.
        G.add_edges_from(graph['edge_index'].T)
        # nx.draw(G, with_labels=True)
        print(G.nodes)
        # plt.show()
        laplacian = laplacian_matrix(G)
        print(laplacian)
    else:
        # NOTE(review): `path` is hard-coded to ppi, ignoring args.dataset,
        # and G is immediately rebuilt implicitly via json2mtx(dataset)
        # below only for laplacian/edges — confirm this branch is current.
        path = "dataset/ppi/ppi.mtx"
        G = mtx2graph(path)
        laplacian, edges = json2mtx(dataset)

    ## whether node features are required
    if args.fusion or args.embed_method == "graphsage":
        if args.dataset == 'ogb':
            feature = graph['node_feat']
        else:
            feature = np.load(feature_path)
            # print(feature[1][0])

    ###### Embed Reduced Graph ######
    print("%%%%%% Starting Graph Embedding %%%%%%")
    # NOTE(review): if --embed_method is none of the three values below,
    # embed_start/embeddings are never bound and the code after the chain
    # raises NameError — consider validating via argparse choices.
    if args.embed_method == "deepwalk":
        embed_start = time.process_time()
        embeddings = deepwalk(G)
    elif args.embed_method == "node2vec":
        embed_start = time.process_time()
        embeddings = node2vec(G)
    elif args.embed_method == "graphsage":
        from embed_methods.graphsage.graphsage import graphsage
        # graphsage implementation expects per-node "test"/"val" flags.
        nx.set_node_attributes(G, False, "test")
        nx.set_node_attributes(G, False, "val")
        ## obtain mapping operator
        if args.coarse == "lamg":
            mapping = normalize(mtx2matrix(mapping_path), norm='l1', axis=1)
        else:
            # NOTE(review): `projections` is not defined anywhere in this
            # chunk — presumably a module-level result of the coarsening
            # step; verify it exists before this branch runs.
            mapping = identity(feature.shape[0])
            for p in projections:
                mapping = mapping @ p
            mapping = normalize(mapping, norm='l1', axis=1).transpose()
        ## control iterations for training
        coarse_ratio = mapping.shape[1] / mapping.shape[0]
        ## map node feats to the coarse graph
        feats = mapping @ feature
        embed_start = time.process_time()
        embeddings = graphsage(G, feats, args.sage_model, args.sage_weighted,
                               int(1000 / coarse_ratio))
    embed_time = time.process_time() - embed_start

    ###### Save Embeddings ######
    np.save(args.embed_path, embeddings)

    ###### Evaluation ######
    print("%%%%%% Starting Evaluation %%%%%%")
    # link prediction
    embeds = np.load(args.embed_path)
    '''
    if args.dataset == "ogb":
        acc, pre, sen, mcc, auc = linkprediction_ogb(split_edge, embeds)
    else:
        acc, pre, sen, mcc, auc = linkprediction(edges, embeds, dataset)'''
    print("Running regression..")
    # node prediction
    # run_regression(np.array(train_embeds), np.array(train_labels), np.array(test_embeds), np.array(test_labels))
    # lr("dataset/{}/".format(dataset), args.embed_path, dataset)

    ###### Report timing information ######
    print("%%%%%% CPU time %%%%%%")
    if args.fusion:
        total_time = embed_time
        # NOTE(review): this f-string has no placeholder, so no fusion time
        # is actually printed — a variable was likely dropped here.
        print(f"Graph Fusion Time:")
    else:
        total_time = embed_time
        print("Graph Fusion Time: 0")
    print(f"Graph Embedding Time: {embed_time:.3f}")
    print(f"Total Time = Embedding_time = {total_time:.3f}")
return None map = dict() with open(file, newline='') as csvfile: csvfile.readline() for (idx, name) in csv.reader(csvfile, delimiter=',', quotechar='|'): map[name] = int(idx) return map args = parse_args() dataset_name = args.dataset if args.do_test: meta = 'dataset_' + re.sub('-', '_', args.dataset) + '/meta_dict.pt' meta_dict = load(meta) dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict) dsplit = dataset.get_edge_split() if args.print_relations: np.set_printoptions(threshold=np.inf) print('test.relations <- c(') print( re.sub('[\[\]]', '', np.array2string(dsplit['test']['relation'], separator=', '))) print(')') elif args.select_head >= 0 or args.select_tail >= 0: for k in dsplit.keys(): for i in range(len(dsplit[k]['head'])): (h, t, r) = (dsplit[k]['head'][i], dsplit[k]['tail'][i], dsplit[k]['relation'][i]) if args.select_head < 0 or args.select_head == h: