Example #1
def get_data_and_targets(schema, neg_data, data_indices, paper, cites,
                         content):
    n_papers = schema.entities[0].n_instances
    n_words = schema.entities[1].n_instances

    train_targets = torch.LongTensor(paper[1])

    # Randomly fill in values and coalesce to remove duplicates
    n_cites_neg = int(neg_data * cites.shape[1])
    #cites_neg = np.random.choice(data_indices, (2, n_cites_neg))
    cites_neg = np.random.randint(0, n_papers, (2, n_cites_neg))
    cites_matrix = SparseMatrix(
        indices=torch.LongTensor(np.concatenate((cites, cites_neg), axis=1)),
        values=torch.cat((torch.ones(cites.shape[1], 1),
                          torch.zeros(n_cites_neg, 1))),
        shape=(n_papers, n_papers, 1)).coalesce()

    # For each paper, randomly fill in values and coalesce to remove duplicates
    n_content_neg = int(neg_data * content.shape[1])
    #content_neg = np.stack((np.random.choice(data_indices, (n_content_neg,)),
    #                np.random.randint(0, n_words, (n_content_neg,))))
    content_neg = np.stack((np.random.randint(0, n_papers, (n_content_neg, )),
                            np.random.randint(0, n_words, (n_content_neg, ))))
    content_matrix = SparseMatrix(
        indices=torch.LongTensor(np.concatenate((content, content_neg),
                                                axis=1)),
        values=torch.cat((torch.ones(content.shape[1], 1),
                          torch.zeros(n_content_neg, 1))),
        shape=(n_papers, n_words, 1)).coalesce()

    data = SparseMatrixData(schema)

    data[0] = cites_matrix
    data[1] = content_matrix
    return data, train_targets
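The core trick above is to pad the true links with randomly drawn negative pairs and let coalesce() merge any duplicate indices. A minimal self-contained sketch of the same idea, using PyTorch's built-in sparse tensors rather than the custom SparseMatrix class:

import numpy as np
import torch

n_papers = 5
pos = np.array([[0, 1, 2], [1, 2, 3]])        # positive (head, tail) pairs
neg = np.random.randint(0, n_papers, (2, 3))  # random pairs; may collide with pos
indices = torch.LongTensor(np.concatenate((pos, neg), axis=1))
values = torch.cat((torch.ones(3), torch.zeros(3)))
# coalesce() sums values at duplicate indices, so a random negative that
# collides with a positive keeps the positive label 1.0
target = torch.sparse_coo_tensor(indices, values, (n_papers, n_papers)).coalesce()
print(target.indices(), target.values())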
Example #2
def make_flat_target_matrix(full_relation, rel_ids, pos_heads, pos_tails, neg_heads, neg_tails, device):
    full_heads, full_tails = np.array([], dtype=np.int32), np.array([], dtype=np.int32)
    for rel_id in rel_ids:
        full_heads = np.concatenate((full_heads, pos_heads[rel_id]))
        full_heads = np.concatenate((full_heads, neg_heads[rel_id]))
        full_tails = np.concatenate((full_tails, pos_tails[rel_id]))
        full_tails = np.concatenate((full_tails, neg_tails[rel_id]))
    n_rels = len(rel_ids)
    indices = torch.LongTensor(np.vstack((full_heads, full_tails)))
    values = torch.zeros((indices.shape[1], n_rels))
    shape = (full_relation.entities[0].n_instances,
             full_relation.entities[1].n_instances, n_rels)
    full_matrix = SparseMatrix(indices=indices, values=values, shape=shape)
    full_matrix = full_matrix.to(device).coalesce_()
    matrix_out = SparseMatrix.from_other_sparse_matrix(full_matrix, 0)

    for rel_id in rel_ids:
        rel_matrix = make_target_matrix(full_relation, pos_heads[rel_id],
                                        pos_tails[rel_id], neg_heads[rel_id],
                                        neg_tails[rel_id], device)

        rel_matrix_full = SparseMatrix.from_other_sparse_matrix(full_matrix, 1) + rel_matrix
        matrix_out.values = torch.cat([matrix_out.values, rel_matrix_full.values], 1)
        matrix_out.n_channels += 1
    return matrix_out
Example #3
def combine_matrices_flat(full_relation, a_pos_heads, a_pos_tails, a_neg_heads,
                          a_neg_tails, ids, b_matrix, device):
    '''
    inputs:
        a_pos_heads, a_neg_heads: dicts of relation ID : head indices
        a_pos_tails, a_neg_tails: dicts of relation ID : tail indices
        ids: IDs with which to access the indices of A
        b_matrix: a matrix whose indices we want to include in the output

    returns:
        out_matrix: matrix with the indices & values of A as well as the
            indices of B
        masks: a dict of relation ID : indices that correspond to the
            indices for each of the relations in A
    '''
    full_heads = np.array([], dtype=np.int32)
    full_tails = np.array([], dtype=np.int32)
    for rel_id in ids:
        full_heads = np.concatenate((full_heads, a_pos_heads[rel_id]))
        full_heads = np.concatenate((full_heads, a_neg_heads[rel_id]))
        full_tails = np.concatenate((full_tails, a_pos_tails[rel_id]))
        full_tails = np.concatenate((full_tails, a_neg_tails[rel_id]))
    indices = torch.LongTensor(np.vstack((full_heads, full_tails)))
    values = torch.zeros((indices.shape[1], 1))
    shape = (full_relation.entities[0].n_instances,
             full_relation.entities[1].n_instances, 1)
    full_a_matrix = SparseMatrix(indices=indices, values=values, shape=shape)
    full_a_matrix = full_a_matrix.to(device).coalesce_()

    b_idx_matrix = SparseMatrix.from_other_sparse_matrix(b_matrix, 1)
    b_idx_matrix.values += 1

    out_idx_matrix = b_idx_matrix + full_a_matrix
    out_matrix = SparseMatrix.from_other_sparse_matrix(out_idx_matrix, 0)

    for rel_id in ids:
        rel_matrix = make_target_matrix(full_relation, a_pos_heads[rel_id],
                                        a_pos_tails[rel_id],
                                        a_neg_heads[rel_id],
                                        a_neg_tails[rel_id], device)

        rel_full_matrix = SparseMatrix.from_other_sparse_matrix(
            out_idx_matrix, 1) + rel_matrix
        out_matrix.values = torch.cat(
            [out_matrix.values, rel_full_matrix.values], 1)
        out_matrix.n_channels += 1

        rel_idx_matrix = SparseMatrix.from_other_sparse_matrix(rel_matrix, 1)
        rel_idx_matrix.values += 1
        rel_idx_full_matrix = SparseMatrix.from_other_sparse_matrix(
            out_idx_matrix, 1) + rel_idx_matrix
        out_idx_matrix.values = torch.cat(
            [out_idx_matrix.values, rel_idx_full_matrix.values], 1)
        out_idx_matrix.n_channels += 1

    masks = {}
    for channel_i, rel_id in enumerate(ids):
        masks[rel_id] = out_idx_matrix.values[:, channel_i + 1].nonzero().squeeze()
    return out_matrix, masks
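The mask extraction at the end relies on indicator channels: each relation contributes a channel that is 1 exactly at that relation's indices, so nonzero() on that channel recovers the positions of the relation's entries within the combined value tensor. The trick in isolation:

import torch

# one indicator channel per relation, row-aligned with the combined values
indicator = torch.tensor([[0., 1.],
                          [1., 0.],
                          [1., 1.],
                          [0., 1.]])
mask_rel0 = indicator[:, 0].nonzero().squeeze()  # tensor([1, 2])
mask_rel1 = indicator[:, 1].nonzero().squeeze()  # tensor([0, 2, 3])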
Example #4
def make_target_matrix(relation, pos_head, pos_tail, neg_head, neg_tail, device):
    n_pos = pos_head.shape[0]
    pos_indices = np.vstack((pos_head, pos_tail))
    pos_values = np.ones((n_pos, 1))
    n_neg = neg_head.shape[0]
    neg_indices = np.vstack((neg_head, neg_tail))
    neg_values = np.zeros((n_neg, 1))
    indices = torch.LongTensor(np.concatenate((pos_indices, neg_indices), 1))
    values = torch.FloatTensor(np.concatenate((pos_values, neg_values), 0))
    shape = (relation.entities[0].n_instances,
             relation.entities[1].n_instances, 1)
    data_target = SparseMatrix(indices=indices, values=values, shape=shape)
    data_target = data_target.to(device).coalesce_()

    return data_target
Example #5
    def binary_relation_to_matrix(self, relation, typedict, raw_vals,
                                  ent_id_to_idx_dict, ent_n_id_str,
                                  ent_m_id_str):
        assert not relation.is_set
        ent_n = relation.entities[0]
        ent_n_name = entity_names[ent_n.id]
        ent_m = relation.entities[1]
        ent_m_name = entity_names[ent_m.id]
        instances_n = ent_n.n_instances
        instances_m = ent_m.n_instances
        tensor_list = []
        for key, val in typedict.items():
            if val == 'id':
                continue
            elif val == 'ordinal':
                func = self.ordinal_to_tensor
            elif val == 'categorical':
                func = self.categorical_to_tensor
            elif val == 'binary':
                func = self.binary_to_tensor
            tensor_list.append(func(raw_vals[key]))
        n_ids = raw_vals[ent_n_id_str]
        m_ids = raw_vals[ent_m_id_str]
        if len(tensor_list) != 0:
            values = torch.cat(tensor_list, 1)
        else:
            values = torch.ones(len(n_ids), 1)
        indices_n = torch.LongTensor(
            [ent_id_to_idx_dict[ent_n_name][ent_i] for ent_i in n_ids])
        indices_m = torch.LongTensor(
            [ent_id_to_idx_dict[ent_m_name][ent_i] for ent_i in m_ids])

        return SparseMatrix(indices=torch.stack((indices_n, indices_m)),
                            values=values,
                            shape=(instances_n, instances_m, values.shape[1]))
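ent_id_to_idx_dict is assumed to map each entity's raw IDs to contiguous indices usable as matrix coordinates. A plausible sketch of the id_to_idx helper that builds such a mapping (the actual implementation may differ):

import torch

def id_to_idx(ids):
    # Assign each distinct raw ID a contiguous index, in order of first appearance
    return {eid: i for i, eid in enumerate(dict.fromkeys(ids))}

raw_ids = ['p17', 'p03', 'p17', 'p22']
mapping = id_to_idx(raw_ids)  # {'p17': 0, 'p03': 1, 'p22': 2}
indices = torch.LongTensor([mapping[e] for e in raw_ids])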
Example #6
    def set_relation_to_matrix(self, relation, typedict, raw_vals):
        assert relation.entities[0] == relation.entities[1]
        assert relation.is_set
        n_instances = relation.entities[0].n_instances
        tensor_list = []
        for key, val in typedict.items():
            if key == self.TARGET_KEY:
                continue
            if val == 'id':
                continue
            elif val == 'ordinal':
                func = self.ordinal_to_tensor
            elif val == 'categorical':
                func = self.categorical_to_tensor
            elif val == 'binary':
                func = self.binary_to_tensor
            tensor_list.append(func(raw_vals[key]))

        if len(tensor_list) != 0:
            values = torch.cat(tensor_list, 1)
        else:
            values = torch.ones(n_instances, 1)
        indices = torch.arange(n_instances).repeat(2, 1)
        return SparseMatrix(indices=indices,
                            values=values,
                            shape=(n_instances, n_instances, values.shape[1]))
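The func dispatch assumes helpers that turn a raw column of values into a per-instance tensor. A sketch of what categorical_to_tensor might look like (an assumption for illustration, not the actual implementation):

import torch
import torch.nn.functional as F

def categorical_to_tensor(vals):
    # Map each category label to an index, then one-hot encode
    cats = {v: i for i, v in enumerate(dict.fromkeys(vals))}
    idx = torch.tensor([cats[v] for v in vals])
    return F.one_hot(idx, num_classes=len(cats)).float()

categorical_to_tensor(['cs', 'math', 'cs'])  # shape (3, 2)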
Example #7
def generate_target_matrix(true_matrix, n_samples, pos_rate, device):
    '''
    Generate a target matrix with n_samples indices, of which a pos_rate
    proportion are true positives and the remaining 1 - pos_rate are
    randomly generated links.
    true_matrix is a matrix containing all true positive links.
    Note that each randomly generated link has a ~99.99% chance of being
    negative, but there may be some false negatives (each relation has
    roughly 1e-4 sparsity).
    '''
    n_n = true_matrix.n
    n_m = true_matrix.m
    n_channels = 1
    n_pos_samples = int(pos_rate * n_samples)
    perm = torch.randperm(true_matrix.nnz())
    pos_sample_idx = perm[:n_pos_samples]
    pos_indices = true_matrix.indices[:, pos_sample_idx]
    pos_values = torch.ones(n_pos_samples).to(device)

    n_neg_samples = n_samples - n_pos_samples
    neg_indices_n = torch.randint(0, n_n, [n_neg_samples]).to(device)
    neg_indices_m = torch.randint(0, n_m, [n_neg_samples]).to(device)
    neg_indices = torch.stack((neg_indices_n, neg_indices_m))
    neg_values = torch.zeros(n_neg_samples).to(device)

    return SparseMatrix(indices=torch.cat((pos_indices, neg_indices), 1),
                        values=torch.cat((pos_values, neg_values),
                                         0).unsqueeze(1),
                        shape=(n_n, n_m, n_channels)).coalesce()
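A hedged usage sketch, assuming true_matrix is a coalesced SparseMatrix holding all positive links (with at least 500 of them): sample 1000 pairs, half true positives and half uniformly random.

# Hypothetical call; `true_matrix` must come from the surrounding pipeline
target = generate_target_matrix(true_matrix, n_samples=1000,
                                pos_rate=0.5, device='cpu')
# target.values is (nnz, 1): 1.0 for sampled positives, 0.0 for random pairs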
Example #8
    def setUp(self):
        '''
        1ooo
        2o3o
        oooo
        4oo5
        '''
        values1 = torch.arange(1, 6, dtype=torch.float32).view(5, 1)
        indices1 = torch.LongTensor([[0, 0], [1, 0], [1, 2], [3, 0], [3, 3]]).T
        shape1 = (4, 4, 1)
        self.X = SparseMatrix(indices1, values1, shape1)
        '''
        o1o2
        oo3o
        oo4o
        5ooo
        '''
        values2 = torch.arange(1, 6, dtype=torch.float32).view(5, 1)
        indices2 = torch.LongTensor([[0, 1], [0, 3], [1, 2], [2, 2], [3, 0]]).T
        shape2 = (4, 4, 1)
        self.Y = SparseMatrix(indices2, values2, shape2)

        self.pooled = torch.arange(1, 5, dtype=torch.float32).view(4, 1)

        # Two-channeled versions:
        '''
        1ooo 6ooo
        2o3o 7o8o
        oooo oooo
        4oo5 9ooX
        '''
        values1_2 = torch.arange(1, 11, dtype=torch.float32).view(2, 5).T
        indices1 = torch.LongTensor([[0, 0], [1, 0], [1, 2], [3, 0], [3, 3]]).T
        shape1_2 = (4, 4, 2)
        self.X2 = SparseMatrix(indices1, values1_2, shape1_2)
        '''
        o1o2 o6o7
        oo3o oo8o
        oo4o oo9o
        5ooo Xooo
        '''
        values2_2 = torch.arange(1, 11, dtype=torch.float32).view(2, 5).T
        indices2 = torch.LongTensor([[0, 1], [0, 3], [1, 2], [2, 2], [3, 0]]).T
        shape2_2 = (4, 4, 2)
        self.Y2 = SparseMatrix(indices2, values2_2, shape2_2)

        self.pooled2 = torch.arange(1, 9, dtype=torch.float32).view(2, 4).T
Example #9
 def to_sparse_matrix(self):
     sparse = {}
     for rel_id in self.schema.relations:
         dense = self.rel_tensors[rel_id]
         sparse[rel_id] = SparseMatrix.from_dense_tensor(dense)
     return SparseMatrixData(self.schema,
                             sparse,
                             batch_size=self.batch_size)
Example #10
 def forward(self, matrix):
     values_out = self.linear(matrix.values)
     shape_out = (matrix.n, matrix.m, values_out.shape[1])
     return SparseMatrix(indices=matrix.indices,
                         values=values_out,
                         shape=shape_out,
                         indices_diag=matrix.indices_diag,
                         is_set=matrix.is_set)
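Since the sparsity pattern is untouched, this layer is just an ordinary nn.Linear applied channel-wise to the (nnz, channels) value tensor. A self-contained sketch of that mapping:

import torch
import torch.nn as nn

values = torch.randn(5, 3)   # nnz x in_channels, as stored in matrix.values
linear = nn.Linear(3, 7)
values_out = linear(values)  # nnz x out_channels; indices stay the same
print(values_out.shape)      # torch.Size([5, 7])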
Example #11
 def test_broadcast_col(self):
     zero_matrix = SparseMatrix.from_other_sparse_matrix(self.X,
                                                         n_channels=1)
     out = zero_matrix.broadcast(self.pooled, "col")
     '''
     1ooo
     2o2o
     oooo
     4oo4
     '''
     self.assertSameValues(out.values, np.array([[1, 2, 2, 4, 4]]).T)
Example #12
 def test_broadcast_row(self):
     zero_matrix = SparseMatrix.from_other_sparse_matrix(self.X,
                                                         n_channels=1)
     out = zero_matrix.broadcast(self.pooled, "row")
     '''
     1ooo
     1o3o
     oooo
     1oo4
     '''
     self.assertSameValues(out.values, np.array([[1, 1, 3, 1, 4]]).T)
Example #13
 def test_broadcast_all(self):
     zero_matrix = SparseMatrix.from_other_sparse_matrix(self.X,
                                                         n_channels=1)
     out = zero_matrix.broadcast(torch.Tensor([5.]), "all")
     '''
     5ooo
     5o5o
     oooo
     5oo5
     '''
     self.assertSameValues(out.values, np.array([[5, 5, 5, 5, 5]]).T)
Example #14
 def test_broadcast_diag(self):
     zero_matrix = SparseMatrix.from_other_sparse_matrix(self.X,
                                                         n_channels=1)
     out = zero_matrix.broadcast(torch.Tensor([5., 2]), "diag")
     '''
     5ooo
     0o0o
     oooo
     0oo5
     '''
     self.assertSameValues(out.values,
                           np.array([[5, 0, 0, 0, 5], [2, 0, 0, 0, 2]]).T)
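Dense analogues of the four broadcast modes exercised above, assuming a 0/1 mask of the sparsity pattern: "col" gives every nonzero in row i the value pooled[i], "row" gives every nonzero in column j the value pooled[j], "all" copies a scalar to every nonzero, and "diag" writes only to diagonal nonzeros.

import torch

mask = torch.tensor([[1, 0, 0, 0],
                     [1, 0, 1, 0],
                     [0, 0, 0, 0],
                     [1, 0, 0, 1]], dtype=torch.float32)
pooled = torch.arange(1., 5.)
col = mask * pooled.view(4, 1)   # values [1, 2, 2, 4, 4] at the nonzeros
row = mask * pooled.view(1, 4)   # values [1, 1, 3, 1, 4]
all_ = mask * 5.                 # values [5, 5, 5, 5, 5]
diag = mask * torch.eye(4) * 5.  # 5 at (0, 0) and (3, 3), 0 elsewhere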
Example #15
 def forward(self, X_in, X_out, indices_identity, indices_trans):
     '''
     X_in: Source sparse tensor
     X_out: Corresponding sparse tensor for target relation
     '''
     self.logger.info("n_params: {}".format(self.n_params))
     if isinstance(X_out, SparseMatrix):
         Y = SparseMatrix.from_other_sparse_matrix(X_out, self.out_dim)
     else:
         Y = X_out.clone()
     #TODO: can add a cache for input operations here
     for i in range(self.n_params):
         op_inp, op_out = self.all_ops[i]
         weight = self.weights[i]
         device = weight.device
         if op_inp is None:
             X_mul = torch.matmul(X_in, weight)
             X_op_out = self.output_op(op_out, X_out, X_mul, device)
         elif op_out is None:
             X_op_inp = self.input_op(op_inp, X_in, device)
             X_mul = torch.matmul(X_op_inp, weight)
             X_op_out = X_mul
         elif op_out[0] == "i":
             # Identity
             X_intersection_vals = X_in.gather_mask(indices_identity[0])
             X_mul = X_intersection_vals @ weight
             X_op_out = X_out.broadcast_from_mask(X_mul,
                                                  indices_identity[1],
                                                  device)
         elif op_out[0] == "t":
             # Transpose
             X_T_intersection_vals = X_in.gather_transpose(indices_trans[0])
             X_mul = X_T_intersection_vals @ weight
             X_op_out = X_out.broadcast_from_mask(X_mul, indices_trans[1],
                                                  device)
         else:
             # Pool or Gather or Do Nothing
             X_op_inp = self.input_op(op_inp, X_in, device)
             # Multiply values by weight
             X_mul = torch.matmul(X_op_inp, weight)
             # Broadcast or Embed Diag or Transpose
             X_op_out = self.output_op(op_out, X_out, X_mul, device)
         #assert X_op_out.nnz() == X_out.nnz()
         #assert Y.nnz() == X_out.nnz(), "Y: {}, X_out: {}".format(Y.nnz(), X_out.nnz())
         #assert Y.nnz() == X_op_out.nnz(), "Y: {}, X_op_out: {}".format(Y.nnz(), X_op_out.nnz())
         Y = Y + X_op_out
     return Y
Example #16
    def make_entity_embeddings(cls, entities, embedding_dim):
        '''
        Initialize zero-valued entity embeddings, one diagonal (set) sparse
        matrix per entity
        '''
        data = {}
        relations = {}
        for ent in entities:
            n_ent = ent.n_instances
            data[ent.id] = SparseMatrix(
                indices=torch.arange(n_ent, dtype=torch.int64).repeat(2, 1),
                values=torch.zeros([n_ent, embedding_dim]),
                shape=(n_ent, n_ent, embedding_dim),
                is_set=True)
            relations[ent.id] = Relation(ent.id, [ent, ent], is_set=True)

        embedding_schema = DataSchema(entities, relations)

        return cls(embedding_schema, data)
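Each embedding is stored as a diagonal "set" matrix: index pair (i, i) carries row i of an n_ent x embedding_dim table. The same structure with a plain COO tensor (torch supports a trailing dense dimension, giving an (n, n, dim) hybrid tensor):

import torch

n_ent, dim = 4, 8
indices = torch.arange(n_ent, dtype=torch.int64).repeat(2, 1)  # [[0..3], [0..3]]
values = torch.zeros(n_ent, dim)
emb = torch.sparse_coo_tensor(indices, values, (n_ent, n_ent, dim))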
Example #17
def select_features(data, schema, feats_type, target_ent):
    '''
    TODO: IMPLEMENT THIS
    '''
    # Select features for nodes
    in_dims = {}
    num_relations = len(schema.relations) - len(schema.entities)

    if feats_type == 0:
        # Keep all node attributes
        pass
    elif feats_type == 1:
        # Set all non-target node attributes to zero
        for ent_i in schema.entities:
            if ent_i.id != target_ent:
                # 10 dimensions for some reason
                n_dim = 10
                rel_id = num_relations + ent_i.id
                data[rel_id] = SparseMatrix.from_other_sparse_matrix(
                    data[rel_id], n_dim)
    '''
    elif feats_type == 2:
        # Set all non-target node attributes to one-hot vector
        for i in range(0, len(features_list)):
            if i != target_ent:
                dim = features_list[i].shape[0]
                indices = torch.arange(n_instances).unsqueeze(0).repeat(2, 1)
                values = torch.FloatTensor(np.ones(dim))
                features_list[i] = torch.sparse.FloatTensor(indices, values, torch.Size([dim, dim])).to(device)
    elif feats_type == 3:
        in_dims = [features.shape[0] for features in features_list]
        for i in range(len(features_list)):
            dim = features_list[i].shape[0]
            indices = np.vstack((np.arange(dim), np.arange(dim)))
            indices = torch.LongTensor(indices)
            values = np.ones(dim)
            features_list[i] = torch.sparse.FloatTensor(indices, values, torch.Size([dim, dim])).to(device)
    '''
    for rel_id in schema.relations:
        in_dims[rel_id] = data[rel_id].n_channels
    return data, in_dims
Example #18
    set_seed(args.seed)

    n_papers = 200
    n_words = 300
    n_classes = 4

    value_dist = torch.distributions.bernoulli.Bernoulli(probs=1. /
                                                         (1. + args.neg_data))
    paper = np.stack(
        [np.arange(n_papers),
         np.random.randint(0, n_classes, n_papers)])

    n_cites = 2 * int(n_papers * 3)
    cites = np.unique(np.random.randint(0, n_papers, (2, n_cites)), axis=1)
    cites_matrix = SparseMatrix(indices=torch.LongTensor(cites),
                                values=value_dist.sample((cites.shape[1], 1)),
                                shape=(n_papers, n_papers, 1)).coalesce()
    n_content = 2 * int(0.2 * n_papers * n_words)

    content = np.stack([
        np.random.randint(0, n_papers, (n_content)),
        np.random.randint(0, n_words, (n_content))
    ])
    # Deduplicate (paper, word) pairs
    content = np.unique(content, axis=1)
    content_matrix = SparseMatrix(indices=torch.LongTensor(content),
                                  values=value_dist.sample(
                                      (content.shape[1], 1)),
                                  shape=(n_papers, n_words, 1)).coalesce()

    ent_papers = Entity(0, n_papers)
    #ent_classes = Entity(1, n_classes)
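With probs = 1 / (1 + neg_data), the sampled values come out positive-to-negative in roughly a 1 : neg_data ratio. A quick check of that rate:

import torch

neg_data = 4.0
dist = torch.distributions.bernoulli.Bernoulli(probs=1. / (1. + neg_data))
sample = dist.sample((100000, 1))
print(sample.mean())  # ~0.2, i.e. about one positive per neg_data negatives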
Example #19
def make_target_matrix_test(relation, left, right, labels, device):
    indices = torch.LongTensor(np.vstack((left, right)))
    values = torch.FloatTensor(labels).unsqueeze(1)
    shape = (relation.entities[0].n_instances,
             relation.entities[1].n_instances, 1)
    return SparseMatrix(indices=indices, values=values, shape=shape).to(device)
Example #20
    relations = []
    rel_movie_actor = Relation(0, [ent_movie, ent_actor])
    rel_movie_director = Relation(1, [ent_movie, ent_director])
    rel_movie_keyword = Relation(2, [ent_movie, ent_keyword])
    rel_movie_feature = Relation(3, [ent_movie, ent_movie], is_set=True)
    relations = [rel_movie_actor, rel_movie_director, rel_movie_keyword, rel_movie_feature]

    schema = DataSchema(entities, relations)
    schema_out = DataSchema([ent_movie], [Relation(0, [ent_movie, ent_movie], is_set=True)])

    data = SparseMatrixData(schema)
    for rel_i, rel_name in enumerate(relation_names):
        if rel_name == 'movie_feature':
            values = preprocess_features(raw_data[rel_name])
            data[rel_i] = SparseMatrix.from_embed_diag(values)
        else:
            data[rel_i] = SparseMatrix.from_scipy_sparse(raw_data[rel_name])
    data = data.to(device)
    indices_identity, indices_transpose = data.calculate_indices()
    input_channels = {rel.id: data[rel.id].n_channels for rel in relations}
    data_target = Data(schema_out)
    n_movies = ent_movie.n_instances
    labels = []
    with open(data_file_dir + 'index_label.txt', 'r') as label_file:
        lines = label_file.readlines()
        for line in lines:
            label = line.rstrip().split(',')[1]
            labels.append(int(label))
    labels = torch.LongTensor(labels).to(device) - min(labels)
Example #21
 schema = dataloader.schema
 data = dataloader.data.to(device)
 indices_identity, indices_transpose = data.calculate_indices()
 embedding_entity = schema.entities[TARGET_NODE_TYPE]
 input_channels = {
     rel.id: data[rel.id].n_channels
     for rel in schema.relations
 }
 embedding_schema = DataSchema(
     schema.entities,
     Relation(0, [embedding_entity, embedding_entity], is_set=True))
 n_instances = embedding_entity.n_instances
 data_embedding = SparseMatrixData(embedding_schema)
 data_embedding[0] = SparseMatrix(
     indices=torch.arange(n_instances, dtype=torch.int64).repeat(2, 1),
     values=torch.zeros([n_instances, args.embedding_dim]),
     shape=(n_instances, n_instances, args.embedding_dim),
     is_set=True)
 data_embedding.to(device)
 target_schema = DataSchema(schema.entities,
                            schema.relations[TARGET_REL_ID])
 target_node_idx_to_id = dataloader.target_node_idx_to_id
 #%%
 net = SparseMatrixAutoEncoder(schema,
                               input_channels,
                               layers=args.layers,
                               embedding_dim=args.embedding_dim,
                               embedding_entities=[embedding_entity],
                               activation=eval('nn.%s()' % args.act_fn),
                               final_activation=nn.Sigmoid(),
                               dropout=args.dropout_rate,
Example #22
    def __init__(self, use_node_attrs=True):
        entities = [
            Entity(entity_id, n_instances)
            for entity_id, n_instances in ENTITY_N_INSTANCES.items()
        ]
        relations = [
            Relation(rel_id, [entities[entity_i], entities[entity_j]])
            for rel_id, (entity_i, entity_j) in RELATION_IDX.items()
        ]
        if use_node_attrs:
            for entity_id in ENTITY_N_INSTANCES.keys():
                rel = Relation(10 + entity_id,
                               [entities[entity_id], entities[entity_id]],
                               is_set=True)
                relations.append(rel)
        self.schema = DataSchema(entities, relations)

        self.node_id_to_idx = {ent_i: {} for ent_i in range(len(entities))}
        with open(NODE_FILE_STR, 'r') as node_file:
            lines = node_file.readlines()
            node_counter = {ent_i: 0 for ent_i in range(len(entities))}
            for line in lines:
                node_id, node_name, node_type, values = line.rstrip().split(
                    '\t')
                node_id = int(node_id)
                node_type = int(node_type)
                node_idx = node_counter[node_type]
                self.node_id_to_idx[node_type][node_id] = node_idx
                node_counter[node_type] += 1
        target_node_id_to_idx = self.node_id_to_idx[TARGET_NODE_TYPE]
        self.target_node_idx_to_id = {
            idx: id
            for id, idx in target_node_id_to_idx.items()
        }

        raw_data_indices = {rel_id: [] for rel_id in range(len(relations))}
        raw_data_values = {rel_id: [] for rel_id in range(len(relations))}
        if use_node_attrs:
            with open(NODE_FILE_STR, 'r') as node_file:
                lines = node_file.readlines()
                for line in lines:
                    node_id, node_name, node_type, values = line.rstrip(
                    ).split('\t')
                    node_type = int(node_type)
                    node_id = self.node_id_to_idx[node_type][int(node_id)]
                    values = list(map(float, values.split(',')))
                    raw_data_indices[10 + node_type].append([node_id, node_id])
                    raw_data_values[10 + node_type].append(values)

        with open(LINK_FILE_STR, 'r') as link_file:
            lines = link_file.readlines()
            for line in lines:
                node_i, node_j, rel_num, val = line.rstrip().split('\t')
                rel_num = int(rel_num)
                node_i_type, node_j_type = RELATION_IDX[rel_num]
                node_i = self.node_id_to_idx[node_i_type][int(node_i)]
                node_j = self.node_id_to_idx[node_j_type][int(node_j)]
                val = float(val)
                raw_data_indices[rel_num].append([node_i, node_j])
                raw_data_values[rel_num].append([val])

        self.data = SparseMatrixData(self.schema)
        for rel in relations:
            indices = torch.LongTensor(raw_data_indices[rel.id]).T
            values = torch.Tensor(raw_data_values[rel.id])
            n = rel.entities[0].n_instances
            m = rel.entities[1].n_instances
            n_channels = values.shape[1]
            data_matrix = SparseMatrix(indices=indices,
                                       values=values,
                                       shape=np.array([n, m, n_channels]),
                                       is_set=rel.is_set)
            del raw_data_indices[rel.id]
            del raw_data_values[rel.id]
            self.data[rel.id] = data_matrix
Example #23
    train_start = int(args.val_pct * (n_targets / 100.))

    val_indices_idx = shuffled_indices_idx[val_start:train_start]
    val_indices = target_indices[val_indices_idx]

    train_indices_idx = shuffled_indices_idx[train_start:]
    train_indices = target_indices[train_indices_idx]

    #%%
    train_targets = targets[train_indices_idx]
    val_targets = targets[val_indices_idx]

    n_output_classes = len(targets.unique())
    data_target[0] = SparseMatrix(
        indices=torch.arange(n_outputs, dtype=torch.int64).repeat(2, 1),
        values=torch.zeros([n_outputs, n_output_classes]),
        shape=(n_outputs, n_outputs, n_output_classes),
        is_set=True).to(device)

    #%%
    net = SparseMatrixEntityPredictor(schema,
                                      input_channels,
                                      layers=args.layers,
                                      fc_layers=args.fc_layers,
                                      activation=eval('nn.%s()' % args.act_fn),
                                      final_activation=nn.Identity(),
                                      target_entities=schema_out.entities,
                                      dropout=args.dropout_rate,
                                      output_dim=n_output_classes,
                                      norm=args.norm,
                                      pool_op=args.pool_op,
Example #24
    def __init__(self):
        data_raw = {
            rel_name: {key: list()
                       for key in schema_dict[rel_name].keys()}
            for rel_name in schema_dict.keys()
        }

        for relation_name in relation_names:
            with open(csv_file_str.format(relation_name)) as file:
                reader = csv.reader(file)
                keys = schema_dict[relation_name].keys()
                for cols in reader:
                    for key, col in zip(keys, cols):
                        data_raw[relation_name][key].append(col)

        ent_person = Entity(0, len(data_raw['person']['p_id']))
        ent_course = Entity(1, len(data_raw['course']['course_id']))
        entities = [ent_person, ent_course]

        rel_person = Relation(0, [ent_person, ent_person], is_set=True)
        rel_course = Relation(1, [ent_course, ent_course], is_set=True)
        rel_advisedBy = Relation(2, [ent_person, ent_person])
        rel_taughtBy = Relation(3, [ent_course, ent_person])
        relations = [rel_person, rel_course, rel_advisedBy, rel_taughtBy]

        self.schema = DataSchema(entities, relations)
        self.data = SparseMatrixData(self.schema)

        ent_id_to_idx_dict = {
            'person': self.id_to_idx(data_raw['person']['p_id']),
            'course': self.id_to_idx(data_raw['course']['course_id'])
        }

        for relation in relations:
            relation_name = relation_names[relation.id]
            print(relation_name)
            if relation.is_set:
                data_matrix = self.set_relation_to_matrix(
                    relation, schema_dict[relation_name],
                    data_raw[relation_name])
            else:
                if relation_name == 'advisedBy':
                    ent_n_id_str = 'p_id'
                    ent_m_id_str = 'p_id_dummy'
                elif relation_name == 'taughtBy':
                    ent_n_id_str = 'course_id'
                    ent_m_id_str = 'p_id'
                data_matrix = self.binary_relation_to_matrix(
                    relation, schema_dict[relation_name],
                    data_raw[relation_name], ent_id_to_idx_dict, ent_n_id_str,
                    ent_m_id_str)
            self.data[relation.id] = data_matrix

        self.target = self.get_targets(
            data_raw[self.TARGET_RELATION][self.TARGET_KEY],
            schema_dict[self.TARGET_RELATION][self.TARGET_KEY])
        self.target_rel_id = 0
        rel_out = Relation(self.target_rel_id, [ent_person, ent_person],
                           is_set=True)
        self.schema_out = DataSchema([ent_person], [rel_out])
        self.data_target = Data(self.schema_out)
        n_output_classes = len(
            np.unique(data_raw[self.TARGET_RELATION][self.TARGET_KEY]))
        self.output_dim = n_output_classes
        n_person = ent_person.n_instances
        self.data_target[self.target_rel_id] = SparseMatrix(
            indices=torch.arange(n_person, dtype=torch.int64).repeat(2, 1),
            values=torch.zeros([n_person, n_output_classes]),
            shape=(n_person, n_person, n_output_classes))
Example #25
def load_data_flat(prefix,
                   use_node_attrs=True,
                   use_edge_data=True,
                   node_val='one'):
    '''
    Load data into one matrix with all relations, reproducing Maron 2019.
    The first [# relation types] channels are adjacency matrices,
    while the next [sum of feature dimensions per entity type] channels have
    node attributes on the relevant segment of their diagonals if use_node_attrs=True.
    If node features aren't included, then node_val is used instead.
    '''
    dl = data_loader(DATA_FILE_DIR + prefix)
    total_n_nodes = dl.nodes['total']
    entities = [Entity(0, total_n_nodes)]
    relations = {0: Relation(0, [entities[0], entities[0]])}
    schema = DataSchema(entities, relations)

    # Sparse Matrix containing all data
    data_full = sum(dl.links['data'].values()).tocoo()
    data_diag = scipy.sparse.coo_matrix(
        (np.ones(total_n_nodes),
         (np.arange(total_n_nodes), np.arange(total_n_nodes))),
        (total_n_nodes, total_n_nodes))
    data_full += data_diag
    data_full = SparseMatrix.from_scipy_sparse(data_full.tocoo()).zero_()
    data_out = SparseMatrix.from_other_sparse_matrix(data_full, 0)
    # Load up all edge data
    for rel_id in sorted(dl.links['data'].keys()):
        data_matrix = dl.links['data'][rel_id]
        data_rel = SparseMatrix.from_scipy_sparse(data_matrix.tocoo())
        if not use_edge_data:
            # Use only adjacency information
            data_rel.values = torch.ones(data_rel.values.shape)
        data_rel_full = SparseMatrix.from_other_sparse_matrix(data_full,
                                                              1) + data_rel
        data_out.values = torch.cat([data_out.values, data_rel_full.values], 1)
        data_out.n_channels += 1

    if use_node_attrs:
        for ent_id, attr_matrix in dl.nodes['attr'].items():
            start_i = dl.nodes['shift'][ent_id]
            n_instances = dl.nodes['count'][ent_id]
            if attr_matrix is None:
                if node_val == 'zero':
                    attr_matrix = np.zeros((n_instances, 1))
                elif node_val == 'rand':
                    attr_matrix = np.random.randn(n_instances, 1)
                else:
                    attr_matrix = np.ones((n_instances, 1))
            n_channels = attr_matrix.shape[1]
            indices = torch.arange(start_i,
                                   start_i + n_instances).unsqueeze(0).repeat(
                                       2, 1)
            data_rel = SparseMatrix(
                indices=indices,
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([total_n_nodes, total_n_nodes, n_channels]),
                is_set=True)
            data_rel_full = SparseMatrix.from_other_sparse_matrix(
                data_full, n_channels) + data_rel
            data_out.values = torch.cat(
                [data_out.values, data_rel_full.values], 1)
            data_out.n_channels += n_channels

    data = SparseMatrixData(schema)
    data[0] = data_out

    return schema,\
           data, \
           dl
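A dense illustration of the channel layout the docstring describes, with assumed toy sizes: the first channels hold one adjacency matrix per relation type, and each entity's node attributes sit on its diagonal block in the remaining channels.

import torch

total = 4                             # entity 0 owns nodes 0-1, entity 1 owns 2-3
adj = [torch.randint(0, 2, (total, total)).float() for _ in range(2)]
attr0 = torch.randn(2, 3)             # 3-dim features for entity 0's two nodes
diag = torch.zeros(3, total, total)
for c in range(3):
    diag[c, torch.arange(2), torch.arange(2)] = attr0[:, c]
flat = torch.stack(adj + list(diag))  # (2 + 3, total, total) channel layout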
Example #26
def load_data(prefix,
              use_node_attrs=True,
              use_edge_data=True,
              use_other_edges=True,
              node_val='one'):
    dl = data_loader(DATA_FILE_DIR + prefix)

    all_entities = [
        Entity(entity_id, n_instances)
        for entity_id, n_instances in sorted(dl.nodes['count'].items())
    ]

    relations = {}
    test_types = dl.test_types
    if use_other_edges:
        for rel_id, (entity_i, entity_j) in sorted(dl.links['meta'].items()):
            relations[rel_id] = Relation(
                rel_id, [all_entities[entity_i], all_entities[entity_j]])

    else:
        for rel_id in test_types:
            entity_i, entity_j = dl.links['meta'][rel_id]
            relations[rel_id] = Relation(
                rel_id, [all_entities[entity_i], all_entities[entity_j]])

    if use_other_edges:
        entities = all_entities
    else:
        entities = list(np.unique(relations[test_types[0]].entities))

    max_relation = max(relations) + 1
    if use_node_attrs:
        # Create fake relations to represent node attributes
        for entity in entities:
            rel_id = max_relation + entity.id
            relations[rel_id] = Relation(rel_id, [entity, entity], is_set=True)
    schema = DataSchema(entities, relations)

    data = SparseMatrixData(schema)
    for rel_id, data_matrix in dl.links['data'].items():
        if use_other_edges or rel_id in test_types:
            # Get subset belonging to entities in relation
            relation = relations[rel_id]
            start_i = dl.nodes['shift'][relation.entities[0].id]
            end_i = start_i + dl.nodes['count'][relation.entities[0].id]
            start_j = dl.nodes['shift'][relation.entities[1].id]
            end_j = start_j + dl.nodes['count'][relation.entities[1].id]
            rel_matrix = data_matrix[start_i:end_i, start_j:end_j]
            data[rel_id] = SparseMatrix.from_scipy_sparse(rel_matrix.tocoo())
            if not use_edge_data:
                # Use only adjacency information
                data[rel_id].values = torch.ones(data[rel_id].values.shape)

    if use_node_attrs:
        for ent in entities:
            ent_id = ent.id
            attr_matrix = dl.nodes['attr'][ent_id]
            n_instances = dl.nodes['count'][ent_id]
            if attr_matrix is None:
                if node_val == 'zero':
                    attr_matrix = np.zeros((n_instances, 1))
                elif node_val == 'rand':
                    attr_matrix = np.random.randn(n_instances, 1)
                else:
                    attr_matrix = np.ones((n_instances, 1))
            n_channels = attr_matrix.shape[1]
            rel_id = ent_id + max_relation
            indices = torch.arange(n_instances).unsqueeze(0).repeat(2, 1)
            data[rel_id] = SparseMatrix(
                indices=indices,
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([n_instances, n_instances, n_channels]),
                is_set=True)

    return schema,\
           data, \
           dl
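The per-relation subsetting assumes dl.links['data'] stores each relation over the global node index space; slicing by the entities' shift/count pulls out that relation's block. The same operation in isolation with scipy:

import scipy.sparse as sp

m = sp.random(6, 6, density=0.3, format='csr')   # global (all-nodes) matrix
start_i, end_i = 0, 3                            # rows owned by entity type 0
start_j, end_j = 3, 6                            # columns owned by entity type 1
block = m[start_i:end_i, start_j:end_j].tocoo()  # relation-specific submatrix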
Example #27
def load_data(prefix='DBLP',
              use_node_attrs=True,
              use_edge_data=True,
              feats_type=0):
    dl = data_loader(DATA_FILE_DIR + prefix)

    # Create Schema
    entities = [
        Entity(entity_id, n_instances)
        for entity_id, n_instances in sorted(dl.nodes['count'].items())
    ]
    relations = {
        rel_id: Relation(rel_id, [entities[entity_i], entities[entity_j]])
        for rel_id, (entity_i, entity_j) in sorted(dl.links['meta'].items())
    }
    num_relations = len(relations)
    if use_node_attrs:
        # Create fake relations to represent node attributes
        for entity in entities:
            rel_id = num_relations + entity.id
            relations[rel_id] = Relation(rel_id, [entity, entity], is_set=True)
    schema = DataSchema(entities, relations)

    # Collect data
    data = SparseMatrixData(schema)
    for rel_id, data_matrix in dl.links['data'].items():
        # Get subset belonging to entities in relation
        start_i = dl.nodes['shift'][relations[rel_id].entities[0].id]
        end_i = start_i + dl.nodes['count'][relations[rel_id].entities[0].id]
        start_j = dl.nodes['shift'][relations[rel_id].entities[1].id]
        end_j = start_j + dl.nodes['count'][relations[rel_id].entities[1].id]
        rel_matrix = data_matrix[start_i:end_i, start_j:end_j]
        data[rel_id] = SparseMatrix.from_scipy_sparse(rel_matrix.tocoo())
        if not use_edge_data:
            # Use only adjacency information
            data[rel_id].values = torch.ones(data[rel_id].values.shape)

    target_entity = 0

    if use_node_attrs:
        for ent_id, attr_matrix in dl.nodes['attr'].items():
            if attr_matrix is None:
                # Attribute for each node is a single 1
                attr_matrix = np.ones(dl.nodes['count'][ent_id])[:, None]
            n_channels = attr_matrix.shape[1]
            rel_id = ent_id + num_relations
            n_instances = dl.nodes['count'][ent_id]
            indices = torch.arange(n_instances).unsqueeze(0).repeat(2, 1)
            data[rel_id] = SparseMatrix(
                indices=indices,
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([n_instances, n_instances, n_channels]),
                is_set=True)

    n_outputs = dl.nodes['count'][target_entity]
    n_output_classes = dl.labels_train['num_classes']
    schema_out = DataSchema([entities[target_entity]], [
        Relation(0, [entities[target_entity], entities[target_entity]],
                 is_set=True)
    ])
    data_target = SparseMatrixData(schema_out)
    data_target[0] = SparseMatrix(
        indices=torch.arange(n_outputs, dtype=torch.int64).repeat(2, 1),
        values=torch.zeros([n_outputs, n_output_classes]),
        shape=(n_outputs, n_outputs, n_output_classes),
        is_set=True)
    labels = np.zeros((dl.nodes['count'][0], dl.labels_train['num_classes']),
                      dtype=int)
    val_ratio = 0.2
    train_idx = np.nonzero(dl.labels_train['mask'])[0]
    np.random.shuffle(train_idx)
    split = int(train_idx.shape[0] * val_ratio)
    val_idx = train_idx[:split]
    train_idx = train_idx[split:]
    train_idx = np.sort(train_idx)
    val_idx = np.sort(val_idx)
    test_idx = np.nonzero(dl.labels_test['mask'])[0]
    labels[train_idx] = dl.labels_train['data'][train_idx]
    labels[val_idx] = dl.labels_train['data'][val_idx]
    if prefix != 'IMDB':
        labels = labels.argmax(axis=1)
    train_val_test_idx = {}
    train_val_test_idx['train_idx'] = train_idx
    train_val_test_idx['val_idx'] = val_idx
    train_val_test_idx['test_idx'] = test_idx
    return schema,\
           schema_out, \
           data, \
           data_target, \
           labels,\
           train_val_test_idx,\
           dl
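The train/val split above works directly on the label mask; the same index bookkeeping in a compact, self-contained form:

import numpy as np

mask = np.array([1, 0, 1, 1, 0, 1, 1, 0])  # 1 where a node has a training label
train_idx = np.nonzero(mask)[0]
np.random.shuffle(train_idx)
split = int(train_idx.shape[0] * 0.2)      # val_ratio = 0.2
val_idx = np.sort(train_idx[:split])
train_idx = np.sort(train_idx[split:])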
Example #28
    if args.training_data == 'val':
        train_data = val_data
        indices_identity = idx_id_val
        indices_transpose = idx_trans_val
    elif args.training_data == 'test':
        train_data = test_data
        indices_identity = idx_id_test
        indices_transpose = idx_trans_test

        val_data = test_data
        idx_id_val = indices_identity
        idx_trans_val = indices_transpose

    data_target = Data(schema_out)
    data_target[0] = SparseMatrix(
        indices=torch.arange(len(paper_names), dtype=torch.int64).repeat(2, 1),
        values=torch.zeros([len(paper_names), n_classes]),
        shape=(len(paper_names), len(paper_names), n_classes))
    data_target = data_target.to(device)

    #%%

    # Loss function:
    def classification_loss(data_pred, data_true):
        return F.cross_entropy(data_pred, data_true)

    n_channels = 1
    net = SparseMatrixEntityPredictor(schema,
                                      n_channels,
                                      layers=args.layers,
                                      fc_layers=args.fc_layers,
                                      activation=eval('nn.%s()' % args.act_fn),
Example #29
def load_data_flat(prefix,
                   use_node_attrs=True,
                   use_edge_data=True,
                   node_val='zero',
                   feats_type=0):
    '''
    Load data into one matrix with all relations, reproducing Maron 2019.
    The first [# relation types] channels are adjacency matrices,
    while the next [sum of feature dimensions per entity type] channels have
    node attributes on the relevant segment of their diagonals if use_node_attrs=True.
    If node features aren't included, then node_val is used instead.
    '''
    dl = data_loader(DATA_FILE_DIR + prefix)
    total_n_nodes = dl.nodes['total']
    entities = [Entity(0, total_n_nodes)]
    relations = {0: Relation(0, [entities[0], entities[0]])}
    schema = DataSchema(entities, relations)

    # Sparse Matrix containing all data
    data_full = sum(dl.links['data'].values()).tocoo()
    data_diag = scipy.sparse.coo_matrix(
        (np.ones(total_n_nodes),
         (np.arange(total_n_nodes), np.arange(total_n_nodes))),
        (total_n_nodes, total_n_nodes))
    data_full += data_diag
    data_full = SparseMatrix.from_scipy_sparse(data_full.tocoo()).zero_()
    data_out = SparseMatrix.from_other_sparse_matrix(data_full, 0)
    # Load up all edge data
    for rel_id in sorted(dl.links['data'].keys()):
        data_matrix = dl.links['data'][rel_id]
        data_rel = SparseMatrix.from_scipy_sparse(data_matrix.tocoo())
        if not use_edge_data:
            # Use only adjacency information
            data_rel.values = torch.ones(data_rel.values.shape)
        data_rel_full = SparseMatrix.from_other_sparse_matrix(data_full,
                                                              1) + data_rel
        data_out.values = torch.cat([data_out.values, data_rel_full.values], 1)
        data_out.n_channels += 1

    target_entity = 0

    if use_node_attrs:
        for ent_id, attr_matrix in dl.nodes['attr'].items():
            start_i = dl.nodes['shift'][ent_id]
            n_instances = dl.nodes['count'][ent_id]
            if attr_matrix is None:
                if node_val == 'zero':
                    attr_matrix = np.zeros((n_instances, 1))
                elif node_val == 'rand':
                    attr_matrix = np.random.randn(n_instances, 1)
                else:
                    attr_matrix = np.ones((n_instances, 1))
            if feats_type == 1 and ent_id != target_entity:
                # To keep same behaviour as non-LGNN model, use 10 dimensions
                attr_matrix = np.zeros((n_instances, 10))
            n_channels = attr_matrix.shape[1]
            indices = torch.arange(start_i,
                                   start_i + n_instances).unsqueeze(0).repeat(
                                       2, 1)
            data_rel = SparseMatrix(
                indices=indices,
                values=torch.FloatTensor(attr_matrix),
                shape=np.array([total_n_nodes, total_n_nodes, n_channels]),
                is_set=True)
            data_rel_full = SparseMatrix.from_other_sparse_matrix(
                data_full, n_channels) + data_rel
            data_out.values = torch.cat(
                [data_out.values, data_rel_full.values], 1)
            data_out.n_channels += n_channels

    data = SparseMatrixData(schema)
    data[0] = data_out

    n_outputs = total_n_nodes
    n_output_classes = dl.labels_train['num_classes']
    schema_out = DataSchema([entities[target_entity]], [
        Relation(0, [entities[target_entity], entities[target_entity]],
                 is_set=True)
    ])
    data_target = SparseMatrixData(schema_out)
    data_target[0] = SparseMatrix(
        indices=torch.arange(n_outputs, dtype=torch.int64).repeat(2, 1),
        values=torch.zeros([n_outputs, n_output_classes]),
        shape=(n_outputs, n_outputs, n_output_classes),
        is_set=True)
    labels = np.zeros((dl.nodes['count'][0], dl.labels_train['num_classes']),
                      dtype=int)
    val_ratio = 0.2
    train_idx = np.nonzero(dl.labels_train['mask'])[0]
    np.random.shuffle(train_idx)
    split = int(train_idx.shape[0] * val_ratio)
    val_idx = train_idx[:split]
    train_idx = train_idx[split:]
    train_idx = np.sort(train_idx)
    val_idx = np.sort(val_idx)
    test_idx = np.nonzero(dl.labels_test['mask'])[0]
    labels[train_idx] = dl.labels_train['data'][train_idx]
    labels[val_idx] = dl.labels_train['data'][val_idx]
    if prefix != 'IMDB':
        labels = labels.argmax(axis=1)
    train_val_test_idx = {}
    train_val_test_idx['train_idx'] = train_idx
    train_val_test_idx['val_idx'] = val_idx
    train_val_test_idx['test_idx'] = test_idx

    return schema,\
           schema_out, \
           data, \
           data_target, \
           labels,\
           train_val_test_idx,\
           dl