def load_data_flat(prefix, use_node_attrs=True, use_edge_data=True,
                   node_val='one'):
    '''
    Load data into one matrix with all relations, reproducing Maron 2019.

    The first [# relation types] channels are adjacency matrices, while the
    next [sum of feature dimensions per entity type] channels have node
    attributes on the relevant segment of their diagonals if
    use_node_attrs=True. If node features aren't included, then node_val is
    used instead.

    Args:
        prefix: dataset directory name, appended to DATA_FILE_DIR.
        use_node_attrs: if True, append one diagonal channel-group per
            entity type holding that type's node attributes.
        use_edge_data: if True, keep the raw edge values; if False, replace
            them with all-ones (pure adjacency).
        node_val: fill strategy when an entity type has no attribute
            matrix: 'zero' for zeros, 'rand' for standard-normal noise,
            anything else for ones.

    Returns:
        Tuple of (schema, data, dl): a single-entity/single-relation
        DataSchema, a SparseMatrixData with the flattened matrix at key 0,
        and the underlying data_loader instance.
    '''
    dl = data_loader(DATA_FILE_DIR + prefix)
    total_n_nodes = dl.nodes['total']
    # Flattened view: one entity covering all nodes, one relation on it.
    entities = [Entity(0, total_n_nodes)]
    relations = {0: Relation(0, [entities[0], entities[0]])}
    schema = DataSchema(entities, relations)

    # Sparse matrix whose pattern is the union of all relations plus the
    # full diagonal (so node-attribute channels have somewhere to live).
    data_full = sum(dl.links['data'].values()).tocoo()
    data_diag = scipy.sparse.coo_matrix(
        (np.ones(total_n_nodes),
         (np.arange(total_n_nodes), np.arange(total_n_nodes))),
        (total_n_nodes, total_n_nodes))
    data_full += data_diag
    # Keep only the sparsity pattern; values are rebuilt channel by channel.
    data_full = SparseMatrix.from_scipy_sparse(data_full.tocoo()).zero_()
    data_out = SparseMatrix.from_other_sparse_matrix(data_full, 0)

    # One channel per relation type, in sorted relation-id order.
    for rel_id in sorted(dl.links['data'].keys()):
        data_matrix = dl.links['data'][rel_id]
        data_rel = SparseMatrix.from_scipy_sparse(data_matrix.tocoo())
        if not use_edge_data:
            # Use only adjacency information
            data_rel.values = torch.ones(data_rel.values.shape)
        # Embed this relation's entries into the full sparsity pattern.
        data_rel_full = SparseMatrix.from_other_sparse_matrix(data_full, 1) \
            + data_rel
        data_out.values = torch.cat([data_out.values, data_rel_full.values], 1)
        data_out.n_channels += 1

    if use_node_attrs:
        for ent_id, attr_matrix in dl.nodes['attr'].items():
            start_i = dl.nodes['shift'][ent_id]
            n_instances = dl.nodes['count'][ent_id]
            if attr_matrix is None:
                # No features for this entity type: synthesize a single
                # channel according to node_val.
                if node_val == 'zero':
                    attr_matrix = np.zeros((n_instances, 1))
                elif node_val == 'rand':
                    attr_matrix = np.random.randn(n_instances, 1)
                else:
                    attr_matrix = np.ones((n_instances, 1))
            n_channels = attr_matrix.shape[1]
            # Diagonal indices for this entity type's node range.
            indices = torch.arange(start_i, start_i + n_instances) \
                .unsqueeze(0).repeat(2, 1)
            data_rel = SparseMatrix(
                indices=indices,
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([total_n_nodes, total_n_nodes, n_channels]),
                is_set=True)
            data_rel_full = SparseMatrix.from_other_sparse_matrix(
                data_full, n_channels) + data_rel
            data_out.values = torch.cat(
                [data_out.values, data_rel_full.values], 1)
            data_out.n_channels += n_channels

    data = SparseMatrixData(schema)
    data[0] = data_out
    return schema, data, dl
def load_data(prefix, use_node_attrs=True, use_edge_data=True,
              use_other_edges=True, node_val='one'):
    '''
    Load a heterogeneous-graph dataset into one sparse matrix per relation.

    Builds one Entity per node type and one Relation per link type (or only
    per test link type when use_other_edges is False), plus one "fake"
    diagonal relation per entity to carry node attributes when
    use_node_attrs is True.

    Args:
        prefix: dataset directory name, appended to DATA_FILE_DIR.
        use_node_attrs: add per-entity diagonal set-relations with node
            attributes (or a node_val-filled single channel when the entity
            has no attribute matrix).
        use_edge_data: if False, edge values are replaced with all-ones.
        use_other_edges: if False, only the relations listed in
            dl.test_types are kept.
        node_val: 'zero', 'rand', or anything else for ones — fill used
            when an entity type has no attributes.

    Returns:
        Tuple of (schema, data, dl): the DataSchema, a SparseMatrixData
        keyed by relation id, and the underlying data_loader.
    '''
    dl = data_loader(DATA_FILE_DIR + prefix)
    # One Entity per node type, ordered by entity id.
    all_entities = [
        Entity(entity_id, n_instances)
        for entity_id, n_instances in sorted(dl.nodes['count'].items())
    ]
    relations = {}
    test_types = dl.test_types
    if use_other_edges:
        # Keep every relation type in the dataset.
        for rel_id, (entity_i, entity_j) in sorted(dl.links['meta'].items()):
            relations[rel_id] = Relation(
                rel_id, [all_entities[entity_i], all_entities[entity_j]])
    else:
        # Keep only the relation types that are being tested on.
        for rel_id in test_types:
            entity_i, entity_j = dl.links['meta'][rel_id]
            relations[rel_id] = Relation(
                rel_id, [all_entities[entity_i], all_entities[entity_j]])

    if use_other_edges:
        entities = all_entities
    else:
        # Only the entities touched by the first test relation.
        # NOTE(review): this looks at test_types[0] only — if other test
        # relations involve additional entities they are dropped; confirm
        # that this is intended.
        entities = list(np.unique(relations[test_types[0]].entities))

    # Fake-relation ids start right after the largest real relation id.
    max_relation = max(relations) + 1
    if use_node_attrs:
        # Create fake relations to represent node attributes
        for entity in entities:
            rel_id = max_relation + entity.id
            relations[rel_id] = Relation(rel_id, [entity, entity],
                                         is_set=True)
    schema = DataSchema(entities, relations)

    data = SparseMatrixData(schema)
    for rel_id, data_matrix in dl.links['data'].items():
        if use_other_edges or rel_id in test_types:
            # Get subset belonging to entities in relation
            # (dl stores every relation in one global node index space;
            # shift/count give each entity's row/column range).
            relation = relations[rel_id]
            start_i = dl.nodes['shift'][relation.entities[0].id]
            end_i = start_i + dl.nodes['count'][relation.entities[0].id]
            start_j = dl.nodes['shift'][relation.entities[1].id]
            end_j = start_j + dl.nodes['count'][relation.entities[1].id]
            rel_matrix = data_matrix[start_i:end_i, start_j:end_j]
            data[rel_id] = SparseMatrix.from_scipy_sparse(rel_matrix.tocoo())
            if not use_edge_data:
                # Use only adjacency information
                data[rel_id].values = torch.ones(data[rel_id].values.shape)

    if use_node_attrs:
        for ent in entities:
            ent_id = ent.id
            attr_matrix = dl.nodes['attr'][ent_id]
            n_instances = dl.nodes['count'][ent_id]
            if attr_matrix is None:
                # No features for this entity type: synthesize one channel.
                if node_val == 'zero':
                    attr_matrix = np.zeros((n_instances, 1))
                elif node_val == 'rand':
                    attr_matrix = np.random.randn(n_instances, 1)
                else:
                    attr_matrix = np.ones((n_instances, 1))
            n_channels = attr_matrix.shape[1]
            rel_id = ent_id + max_relation
            # Diagonal indices: attribute of node i sits at (i, i).
            indices = torch.arange(n_instances).unsqueeze(0).repeat(2, 1)
            data[rel_id] = SparseMatrix(
                indices=indices,
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([n_instances, n_instances, n_channels]),
                is_set=True)
    return schema,\
        data, \
        dl
def load_data(prefix='DBLP', use_node_attrs=True, use_edge_data=True,
              feats_type=0):
    '''
    Load a node-classification dataset (DBLP-style) into sparse matrices.

    Builds the schema (one Entity per node type, one Relation per link type,
    plus fake diagonal relations for node attributes), the per-relation
    data, an all-zero prediction target for entity 0, the label matrix, and
    a shuffled train/val/test index split.

    Args:
        prefix: dataset directory name, appended to DATA_FILE_DIR.
        use_node_attrs: add per-entity diagonal set-relations holding node
            attributes (all-ones single channel when absent).
        use_edge_data: if False, edge values are replaced with all-ones.
        feats_type: unused in this function body — TODO confirm whether it
            is intentionally ignored here.

    Returns:
        Tuple of (schema, schema_out, data, data_target, labels,
        train_val_test_idx, dl). labels is one-hot for 'IMDB' and argmaxed
        to class indices otherwise.
    '''
    dl = data_loader(DATA_FILE_DIR + prefix)
    # Create Schema
    entities = [
        Entity(entity_id, n_instances)
        for entity_id, n_instances in sorted(dl.nodes['count'].items())
    ]
    relations = {
        rel_id: Relation(rel_id, [entities[entity_i], entities[entity_j]])
        for rel_id, (entity_i, entity_j) in sorted(dl.links['meta'].items())
    }
    num_relations = len(relations)
    if use_node_attrs:
        # Create fake relations to represent node attributes
        for entity in entities:
            rel_id = num_relations + entity.id
            relations[rel_id] = Relation(rel_id, [entity, entity],
                                         is_set=True)
    schema = DataSchema(entities, relations)

    # Collect data
    data = SparseMatrixData(schema)
    for rel_id, data_matrix in dl.links['data'].items():
        # Get subset belonging to entities in relation
        # (dl stores relations in one global index space; shift/count give
        # each entity's row/column range).
        start_i = dl.nodes['shift'][relations[rel_id].entities[0].id]
        end_i = start_i + dl.nodes['count'][relations[rel_id].entities[0].id]
        start_j = dl.nodes['shift'][relations[rel_id].entities[1].id]
        end_j = start_j + dl.nodes['count'][relations[rel_id].entities[1].id]
        rel_matrix = data_matrix[start_i:end_i, start_j:end_j]
        data[rel_id] = SparseMatrix.from_scipy_sparse(rel_matrix.tocoo())
        if not use_edge_data:
            # Use only adjacency information
            data[rel_id].values = torch.ones(data[rel_id].values.shape)

    # Entity 0 is the classification target throughout this function.
    target_entity = 0
    if use_node_attrs:
        for ent_id, attr_matrix in dl.nodes['attr'].items():
            if attr_matrix is None:
                # Attribute for each node is a single 1
                attr_matrix = np.ones(dl.nodes['count'][ent_id])[:, None]
            n_channels = attr_matrix.shape[1]
            rel_id = ent_id + num_relations
            n_instances = dl.nodes['count'][ent_id]
            # Diagonal indices: attribute of node i sits at (i, i).
            indices = torch.arange(n_instances).unsqueeze(0).repeat(2, 1)
            data[rel_id] = SparseMatrix(
                indices=indices,
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([n_instances, n_instances, n_channels]),
                is_set=True)

    # Output schema: a single diagonal set-relation over the target entity,
    # with one channel per output class, initialised to zero.
    n_outputs = dl.nodes['count'][target_entity]
    n_output_classes = dl.labels_train['num_classes']
    schema_out = DataSchema([entities[target_entity]], [
        Relation(0, [entities[target_entity], entities[target_entity]],
                 is_set=True)
    ])
    data_target = SparseMatrixData(schema_out)
    data_target[0] = SparseMatrix(
        indices=torch.arange(n_outputs, dtype=torch.int64).repeat(2, 1),
        values=torch.zeros([n_outputs, n_output_classes]),
        shape=(n_outputs, n_outputs, n_output_classes),
        is_set=True)

    # Multi-label matrix for entity 0's nodes (target_entity is 0 here).
    labels = np.zeros((dl.nodes['count'][0], dl.labels_train['num_classes']),
                      dtype=int)
    # Carve a validation split out of the (shuffled) training mask.
    # NOTE(review): np.random.shuffle is unseeded — the split differs per
    # call unless the caller seeds numpy.
    val_ratio = 0.2
    train_idx = np.nonzero(dl.labels_train['mask'])[0]
    np.random.shuffle(train_idx)
    split = int(train_idx.shape[0] * val_ratio)
    val_idx = train_idx[:split]
    train_idx = train_idx[split:]
    train_idx = np.sort(train_idx)
    val_idx = np.sort(val_idx)
    test_idx = np.nonzero(dl.labels_test['mask'])[0]
    labels[train_idx] = dl.labels_train['data'][train_idx]
    labels[val_idx] = dl.labels_train['data'][val_idx]
    if prefix != 'IMDB':
        # Single-label datasets: convert one-hot rows to class indices.
        labels = labels.argmax(axis=1)
    train_val_test_idx = {}
    train_val_test_idx['train_idx'] = train_idx
    train_val_test_idx['val_idx'] = val_idx
    train_val_test_idx['test_idx'] = test_idx
    return schema,\
        schema_out, \
        data, \
        data_target, \
        labels,\
        train_val_test_idx,\
        dl
# --- Movie (IMDB-style) dataset setup fragment ---
# NOTE(review): this is a mid-script fragment; ent_movie, ent_actor,
# ent_director, ent_keyword, entities, relation_names, raw_data,
# preprocess_features, data_file_dir and device are defined earlier in the
# file (outside this view).

# Relations 0-2 link movies to actors/directors/keywords; relation 3 is a
# diagonal set-relation carrying per-movie features.
rel_movie_actor = Relation(0, [ent_movie, ent_actor])
rel_movie_director = Relation(1, [ent_movie, ent_director])
rel_movie_keyword = Relation(2, [ent_movie, ent_keyword])
rel_movie_feature = Relation(3, [ent_movie, ent_movie], is_set=True)
relations = [rel_movie_actor, rel_movie_director, rel_movie_keyword,
             rel_movie_feature]
schema = DataSchema(entities, relations)
# Output schema: one diagonal set-relation over movies (classification).
schema_out = DataSchema([ent_movie],
                        [Relation(0, [ent_movie, ent_movie], is_set=True)])

data = SparseMatrixData(schema)
# relation_names is assumed to be ordered to match relation ids 0..3 —
# TODO confirm against its definition.
for rel_i, rel_name in enumerate(relation_names):
    if rel_name == 'movie_feature':
        # Features go on the diagonal; other relations are plain sparse
        # adjacency/edge matrices.
        values = preprocess_features(raw_data[rel_name])
        data[rel_i] = SparseMatrix.from_embed_diag(values)
    else:
        data[rel_i] = SparseMatrix.from_scipy_sparse(raw_data[rel_name])
data = data.to(device)
indices_identity, indices_transpose = data.calculate_indices()
input_channels = {rel.id: data[rel.id].n_channels for rel in relations}
data_target = Data(schema_out)
n_movies = ent_movie.n_instances

# Labels file: one "index,label" pair per line; shift so the smallest
# label becomes class 0.
labels = []
with open(data_file_dir + 'index_label.txt', 'r') as label_file:
    lines = label_file.readlines()
    for line in lines:
        label = line.rstrip().split(',')[1]
        labels.append(int(label))
labels = torch.LongTensor(labels).to(device) - min(labels)

# Unseeded random permutation of all movie indices for the data split
# (continues past this view; val_start begins the split bookkeeping).
shuffled_indices = random.sample(range(n_movies), n_movies)
val_start = 0
def load_data_flat(prefix, use_node_attrs=True, use_edge_data=True,
                   node_val='zero', feats_type=0):
    '''
    Load data into one matrix with all relations, reproducing Maron 2019.

    The first [# relation types] channels are adjacency matrices, while the
    next [sum of feature dimensions per entity type] channels have node
    attributes on the relevant segment of their diagonals if
    use_node_attrs=True. If node features aren't included, then node_val is
    used instead. Also builds the zero-initialised prediction target for
    entity 0, the label matrix, and a shuffled train/val/test split.

    Args:
        prefix: dataset directory name, appended to DATA_FILE_DIR.
        use_node_attrs: append per-entity diagonal attribute channels.
        use_edge_data: if False, edge values are replaced with all-ones.
        node_val: fill when an entity type has no attributes: 'zero',
            'rand' (standard normal), or anything else for ones.
        feats_type: when 1, non-target entities get 10 zero channels
            instead of their attributes (matches the non-LGNN baseline).

    Returns:
        Tuple of (schema, schema_out, data, data_target, labels,
        train_val_test_idx, dl). labels is one-hot for 'IMDB' and argmaxed
        to class indices otherwise.
    '''
    dl = data_loader(DATA_FILE_DIR + prefix)
    total_n_nodes = dl.nodes['total']
    # Flattened view: one entity covering all nodes, one relation on it.
    entities = [Entity(0, total_n_nodes)]
    relations = {0: Relation(0, [entities[0], entities[0]])}
    schema = DataSchema(entities, relations)

    # Sparse matrix whose pattern is the union of all relations plus the
    # full diagonal (so node-attribute channels have somewhere to live).
    data_full = sum(dl.links['data'].values()).tocoo()
    data_diag = scipy.sparse.coo_matrix(
        (np.ones(total_n_nodes),
         (np.arange(total_n_nodes), np.arange(total_n_nodes))),
        (total_n_nodes, total_n_nodes))
    data_full += data_diag
    # Keep only the sparsity pattern; values are rebuilt channel by channel.
    data_full = SparseMatrix.from_scipy_sparse(data_full.tocoo()).zero_()
    data_out = SparseMatrix.from_other_sparse_matrix(data_full, 0)

    # One channel per relation type, in sorted relation-id order.
    for rel_id in sorted(dl.links['data'].keys()):
        data_matrix = dl.links['data'][rel_id]
        data_rel = SparseMatrix.from_scipy_sparse(data_matrix.tocoo())
        if not use_edge_data:
            # Use only adjacency information
            data_rel.values = torch.ones(data_rel.values.shape)
        # Embed this relation's entries into the full sparsity pattern.
        data_rel_full = SparseMatrix.from_other_sparse_matrix(data_full, 1) \
            + data_rel
        data_out.values = torch.cat([data_out.values, data_rel_full.values], 1)
        data_out.n_channels += 1

    # Entity 0 is the classification target throughout this function.
    target_entity = 0
    if use_node_attrs:
        for ent_id, attr_matrix in dl.nodes['attr'].items():
            start_i = dl.nodes['shift'][ent_id]
            n_instances = dl.nodes['count'][ent_id]
            if attr_matrix is None:
                # No features for this entity type: synthesize a single
                # channel according to node_val.
                if node_val == 'zero':
                    attr_matrix = np.zeros((n_instances, 1))
                elif node_val == 'rand':
                    attr_matrix = np.random.randn(n_instances, 1)
                else:
                    attr_matrix = np.ones((n_instances, 1))
            if feats_type == 1 and ent_id != target_entity:
                # To keep same behaviour as non-LGNN model, use 10 dimensions
                attr_matrix = np.zeros((n_instances, 10))
            n_channels = attr_matrix.shape[1]
            # Diagonal indices for this entity type's node range.
            indices = torch.arange(start_i, start_i + n_instances) \
                .unsqueeze(0).repeat(2, 1)
            data_rel = SparseMatrix(
                indices=indices,
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([total_n_nodes, total_n_nodes, n_channels]),
                is_set=True)
            data_rel_full = SparseMatrix.from_other_sparse_matrix(
                data_full, n_channels) + data_rel
            data_out.values = torch.cat(
                [data_out.values, data_rel_full.values], 1)
            data_out.n_channels += n_channels

    data = SparseMatrixData(schema)
    data[0] = data_out

    # Output schema: one diagonal set-relation over the (single, flattened)
    # target entity, one channel per output class, initialised to zero.
    n_outputs = total_n_nodes
    n_output_classes = dl.labels_train['num_classes']
    schema_out = DataSchema([entities[target_entity]], [
        Relation(0, [entities[target_entity], entities[target_entity]],
                 is_set=True)
    ])
    data_target = SparseMatrixData(schema_out)
    data_target[0] = SparseMatrix(
        indices=torch.arange(n_outputs, dtype=torch.int64).repeat(2, 1),
        values=torch.zeros([n_outputs, n_output_classes]),
        shape=(n_outputs, n_outputs, n_output_classes),
        is_set=True)

    # Multi-label matrix for entity 0's nodes.
    labels = np.zeros((dl.nodes['count'][0], dl.labels_train['num_classes']),
                      dtype=int)
    # Carve a validation split out of the (shuffled) training mask.
    # NOTE(review): np.random.shuffle is unseeded — the split differs per
    # call unless the caller seeds numpy.
    val_ratio = 0.2
    train_idx = np.nonzero(dl.labels_train['mask'])[0]
    np.random.shuffle(train_idx)
    split = int(train_idx.shape[0] * val_ratio)
    val_idx = train_idx[:split]
    train_idx = train_idx[split:]
    train_idx = np.sort(train_idx)
    val_idx = np.sort(val_idx)
    test_idx = np.nonzero(dl.labels_test['mask'])[0]
    labels[train_idx] = dl.labels_train['data'][train_idx]
    labels[val_idx] = dl.labels_train['data'][val_idx]
    if prefix != 'IMDB':
        # Single-label datasets: convert one-hot rows to class indices.
        labels = labels.argmax(axis=1)
    train_val_test_idx = {
        'train_idx': train_idx,
        'val_idx': val_idx,
        'test_idx': test_idx,
    }
    return schema, schema_out, data, data_target, labels, \
        train_val_test_idx, dl