# Example #1
    dataset = Planetoid(root='tmp', name='PubMed')
    print("use dataset: PubMed")
data = dataset[0]

# Hold out 10% of edges for validation and 20% for testing; work on a clone
# so `data` keeps its full edge set for the reconstruction target below.
enhanced_data = train_test_split_edges(data.clone(),
                                       val_ratio=0.1,
                                       test_ratio=0.2)

# Training graph contains only the training positive edges.
train_data = Data(x=enhanced_data.x,
                  edge_index=enhanced_data['train_pos_edge_index']).to(DEVICE)
target_data = data.to(DEVICE)

# Bug fix: `is` compares object identity, not string equality — comparing an
# argparse string against a literal with `is` is unreliable (and a
# SyntaxWarning on modern Python). Use `==`.
if args.model == 'VGAE':
    model = VGAE(encoder=VEncoder(data['x'].shape[1])).to(DEVICE)
else:
    model = GAE(encoder=Encoder(data['x'].shape[1])).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(),
                             lr=args.learning_rate,
                             weight_decay=5e-4)


def model_train():
    print("========Start training========")
    for epoch in range(args.num_epoch):
        model.train()
        optimizer.zero_grad()
        z = model.encode(train_data)
        recon_loss = model.recon_loss(z, target_data['edge_index'])
        if args.model is 'VGAE':
            recon_loss += model.kl_loss() / data['x'].shape[0]
# Example #2
        self.conv2 = GCNConv(2 * out_channels, out_channels, cached=False)
        # Map into 2*out_channels dimentions with

    def forward(self, x, edge_index):
        """Encode node features: GCN -> ReLU -> GCN."""
        return self.conv2(F.relu(self.conv1(x, edge_index)), edge_index)


# Hyper-parameters: 32-dimensional latent embedding over 75 input features.
channels = 32
num_features = 75

# Run on the GPU when one is available.
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Note: only the encoder is moved to the device here, not the GAE wrapper.
model = GAE(Encoder(num_features, channels).to(dev))
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)


def train(loader):
    model.train()
    loss_all = 0
    for data in loader:
        x, train_edge_index = data.x.to(dev), data.edge_index.to(dev)
        optimizer.zero_grad()
        z = model.encode(x, train_edge_index)
        loss = model.recon_loss(z, train_edge_index)
        loss.backward()
# Example #3
def run_GAE(input_data, output_dir, epochs=1000, lr=0.01, weight_decay=0.0005):
    """Train a GAE on a link-prediction split of `input_data` and log results.

    Args:
        input_data: graph object exposing num_features, num_classes, edge_attr, y.
        output_dir: directory that receives the plot and the log file.
        epochs: number of full-graph training epochs.
        lr / weight_decay: Adam optimizer settings.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Device: '.ljust(32), device)
    print('Model Name: '.ljust(32), 'GAE')
    print('Model params:{:19} lr: {}   weight_decay: {}'.format(
        '', lr, weight_decay))
    print('Total number of epochs to run: '.ljust(32), epochs)
    print('*' * 70)

    # Work on a copy so the caller's graph is left untouched.
    data = input_data.clone().to(device)
    in_channels = data.num_features
    out_channels = data.num_classes.item()
    model = GAE(GAEncoder(in_channels, out_channels)).to(device)
    split_data = model.split_edges(data)
    x, train_pos_edge_index, edge_attr = split_data.x.to(
        device), split_data.train_pos_edge_index.to(device), data.edge_attr.to(
            device)
    split_data.train_idx = split_data.test_idx = data.y = None
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 weight_decay=weight_decay)

    train_losses, test_losses = [], []
    aucs, aps = [], []

    # Loop-invariant: compute the output name and create the directory once.
    figname = os.path.join(
        output_dir, "_".join((GAE.__name__, str(lr), str(weight_decay))))
    makepath(output_dir)

    # Report progress roughly ten times over the run; the max() guard avoids
    # a modulo-by-zero when epochs < 10 (original: int(epochs / 10)).
    report_every = max(1, epochs // 10)

    for epoch in range(1, epochs + 1):
        # Bug fix: training mode must be re-entered every epoch — the original
        # called model.eval() inside the loop and never switched back, so all
        # epochs after the first trained in eval mode.
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        train_loss = model.recon_loss(z, train_pos_edge_index)
        # Store plain floats; keeping the loss tensors would retain every
        # epoch's autograd graph in memory.
        train_losses.append(train_loss.item())
        train_loss.backward()
        optimizer.step()

        # Evaluate with gradients disabled.
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            auc, ap = model.test(z, split_data.test_pos_edge_index,
                                 split_data.test_neg_edge_index)
            test_loss = model.recon_loss(z, split_data.test_pos_edge_index)
        test_losses.append(test_loss.item())
        aucs.append(auc)
        aps.append(ap)

        if epoch % report_every == 0:
            print(
                'Epoch: {}       Train loss: {}    Test loss: {}     AUC: {}    AP: {}'
                .format(epoch, train_loss.item(), test_loss.item(), auc, ap))
        if epoch == epochs:
            print(
                '-' * 65,
                '\nFinal epoch: {}    Train loss: {}    Test loss: {}    AUC: {}    AP: {}'
                .format(epoch, train_loss.item(), test_loss.item(), auc, ap))
        log = 'Final epoch: {}    Train loss: {}    Test loss: {}    AUC: {}    AP: {}'.format(
            epoch, train_loss.item(), test_loss.item(), auc, ap)
        write_log(log, figname)
    print('-' * 65)

    plot_linkpred(train_losses, test_losses, aucs, aps, output_dir, epochs,
                  figname)
    return
                    help='Residual connection')
args = parser.parse_args()

# Locate the dataset directory next to this script.
# Bug fix: os.join / os.dirname / os.realpath do not exist — those helpers
# live in os.path, so the original raised AttributeError at import time.
path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'data',
                    args.dataset)
dataset = Planetoid(path, args.dataset)

dev = torch.device(args.dev)

# Bug fix: args.model is an argparse string, so compare against the model
# name; comparing to the VGAE class object could never be equal, silently
# always selecting the GAE branch.
if args.model == 'VGAE':
    model = VGAE(
        Encoder_VGAE(dataset.num_features, args.hidden1, args.hidden2,
                     args.depth, args.res)).to(dev)
else:
    model = GAE(
        Encoder_GAE(dataset.num_features, args.hidden1, args.hidden2,
                    args.depth, args.res)).to(dev)

auc_score_list = []
ap_score_list = []

print("Dataset: ", args.dataset, " Model: ", args.model, ", Residual :",
      args.res, ", Layer depth:", args.depth, " ")

for i in range(1, args.runs + 1):
    data = dataset[0]
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    data = train_test_split_edges(data)

    x, train_pos_edge_index = data.x.to(dev), data.train_pos_edge_index.to(dev)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
# Encoder configuration flags.
act = F.relu
sum_res = True
variational = False

# Test fixtures: a small contact-map trajectory plus one-hot node labels.
path = Path(__file__).parent / "../../test/data/BBA-subset-100.h5"
node_feature_path = (
    Path(__file__).parent / "../../test/data/onehot_bba_amino_acid_labels.npy"
)
dataset = ContactMapDataset(
    path, "contact_map", ["rmsd"], node_feature_path=node_feature_path
)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

# Node-level autoencoder: linear or GCN encoder, chosen by the CLI flag.
node_encoder_cls = LinearEncoder if args.linear else GCNEncoder
node_ae = GAE(node_encoder_cls(num_features, node_out_channels))

# Graph-level (variational) encoder.
encoder = VariationalGraphEncoder(
    node_out_channels,
    hidden_channels,
    graph_out_channels,
    depth,
    pool_ratios,
    act,
    variational,
)
decoder = VariationalGraphDecoder(
    graph_out_channels,
# Example #6
class VariationalLinearEncoder(torch.nn.Module):
    """Two parallel cached GCN layers producing the (mu, log-std) pair."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv_mu = GCNConv(in_channels, out_channels, cached=True)
        self.conv_logstd = GCNConv(in_channels, out_channels, cached=True)

    def forward(self, x, edge_index):
        mu = self.conv_mu(x, edge_index)
        logstd = self.conv_logstd(x, edge_index)
        return mu, logstd


# Latent dimensionality of the autoencoder.
out_channels = 16
num_features = dataset.num_features

# Pick the autoencoder flavour from the CLI flags: variational vs plain,
# linear vs GCN encoder.
if args.variational:
    enc_cls = VariationalLinearEncoder if args.linear else VariationalGCNEncoder
    model = VGAE(enc_cls(num_features, out_channels))
else:
    enc_cls = LinearEncoder if args.linear else GCNEncoder
    model = GAE(enc_cls(num_features, out_channels))

# Move model and tensors to the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
x = data.x.to(device)
train_pos_edge_index = data.train_pos_edge_index.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Example #7
def perturb_edges(data,
                  name,
                  remove_pct,
                  add_pct,
                  hidden_channels=16,
                  epochs=400):
    """Augment the graph by removing/adding a percentage of edges.

    A GAE is trained on the training graph; its reconstruction scores decide
    which edges to drop and which to add.  Both the scores and the sampled
    edge_index are cached on disk, so repeated runs are cheap.  Mutates
    ``data`` in place (train_edge_index or edge_index depending on setting).

    Args:
        data: graph object; ``data.setting`` selects inductive vs transductive.
        name: dataset name used as the cache key.
        remove_pct / add_pct: percentages of edges to remove / add.
        hidden_channels: GAE latent width.
        epochs: GAE training epochs.
    """
    if remove_pct == 0 and add_pct == 0:
        return

    # 1) Fast path: a fully materialized augmented edge_index is cached.
    # NOTE(review): pickle.load is only safe because these cache files are
    # produced locally by this function — never point it at untrusted files.
    try:
        cached = pickle.load(
            open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'rb'))
        print(f'Use cached edge augmentation for dataset {name}')

        if data.setting == 'inductive':
            data.train_edge_index = cached
        else:
            data.edge_index = cached
        return
    except FileNotFoundError:
        # 2) Second-level cache: reconstruction scores exist; only the
        # deterministic sampling step is missing.
        try:
            A_pred, adj_orig = pickle.load(
                open(f'{ROOT}/cache/edge/{name}.pt', 'rb'))
            A = sample_graph_det(adj_orig, A_pred, remove_pct, add_pct)
            data.edge_index, _ = from_scipy_sparse_matrix(A)
            pickle.dump(
                data.edge_index,
                open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt',
                     'wb'))
            return
        except FileNotFoundError:
            print(
                f'cache/edge/{name}_{remove_pct}_{add_pct}.pt not found! Regenerating it now'
            )

    # 3) Cold path: train a GAE from scratch.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if data.setting == 'inductive':
        train_data = Data(x=data.train_x,
                          ori_x=data.ori_x,
                          edge_index=data.train_edge_index,
                          y=data.train_y)
    else:
        train_data = deepcopy(data)

    # Keep the unsplit edge set: it is the reconstruction target below.
    edge_index = deepcopy(train_data.edge_index)
    train_data = train_test_split_edges(train_data,
                                        val_ratio=0.1,
                                        test_ratio=0)
    num_features = train_data.ori_x.shape[1]
    model = GAE(GCNEncoder(num_features, hidden_channels)).to(device)
    x = train_data.ori_x.to(device)
    train_pos_edge_index = train_data.train_pos_edge_index.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    best_val_auc = 0
    best_z = None
    for epoch in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_edge_index)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)

        auc, ap = model.test(z, train_data.val_pos_edge_index,
                             train_data.val_neg_edge_index)
        print('Val | Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(
            epoch, auc, ap))
        if auc > best_val_auc:
            best_val_auc = auc
            best_z = deepcopy(z)

    # Bug fix: score edges with the embedding that achieved the best
    # validation AUC — the original used the last epoch's ``z``, which made
    # the best_z bookkeeping above dead code.  Fall back to the final z when
    # no epoch ever improved on the initial AUC of 0.
    if best_z is None:
        best_z = z
    A_pred = torch.sigmoid(torch.mm(best_z, best_z.T)).cpu().numpy()

    adj_orig = to_scipy_sparse_matrix(edge_index).asformat('csr')
    adj_pred = sample_graph_det(adj_orig, A_pred, remove_pct, add_pct)

    if data.setting == 'inductive':
        data.train_edge_index, _ = from_scipy_sparse_matrix(adj_pred)
    else:
        data.edge_index, _ = from_scipy_sparse_matrix(adj_pred)

    # Populate both cache levels for future runs.
    pickle.dump((A_pred, adj_orig), open(f'{ROOT}/cache/edge/{name}.pt', 'wb'))

    if data.setting == 'inductive':
        pickle.dump(
            data.train_edge_index,
            open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'wb'))
    else:
        pickle.dump(
            data.edge_index,
            open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'wb'))
# Example #8
def run_model(dataset, conf):
    """Build a table graph, train a graph autoencoder on it, and evaluate.

    Args:
        dataset: input table collection.
        conf: configuration dict with keys add_attr, shuffle_vocab,
            row_edges_sample, column_edges_sample, vector_size, epoch_num.

    Returns:
        (cell_vectors, vec_list, losses, result_score)
    """
    # 1) Build the table graph: tokenize tables into a vocabulary.
    tokenized_tables, vocabulary, cell_dict, reversed_dictionary = corpus_tuple = create_corpus(
        dataset, include_attr=conf["add_attr"])
    if conf["shuffle_vocab"]:
        shuffled_vocab = shuffle_vocabulary(vocabulary)
    else:
        shuffled_vocab = None

    nodes = build_node_features(vocabulary)
    row_edges_index, row_edges_weights = build_graph_edges(
        tokenized_tables,
        s_vocab=shuffled_vocab,
        sample_frac=conf["row_edges_sample"],
        columns=False)
    col_edges_index, col_edges_weights = build_graph_edges(
        tokenized_tables,
        s_vocab=shuffled_vocab,
        sample_frac=conf["column_edges_sample"],
        columns=True)

    edges = torch.cat((row_edges_index, col_edges_index), dim=1)
    weights = torch.cat((row_edges_weights, col_edges_weights), dim=0)
    graph_data = Data(x=nodes, edge_index=edges, edge_attr=weights)

    # 2) Run the table autoencoder.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    loader = DataLoader(torch.arange(graph_data.num_nodes),
                        batch_size=128,
                        shuffle=True)
    graph_data = graph_data.to(device)

    x, train_pos_edge_index = nodes, edges

    class Encoder(torch.nn.Module):
        """GCN encoder producing a (mu, logvar) pair."""

        def __init__(self, in_channels, out_channels):
            super(Encoder, self).__init__()
            self.conv1 = GCNConv(in_channels, 2 * out_channels, cached=True)
            self.conv_mu = GCNConv(2 * out_channels, out_channels, cached=True)
            self.conv_logvar = GCNConv(2 * out_channels,
                                       out_channels,
                                       cached=True)

        def forward(self, x, edge_index):
            x = F.relu(self.conv1(x, edge_index))
            return self.conv_mu(x, edge_index), self.conv_logvar(x, edge_index)

    # NOTE(review): the encoder returns (mu, logvar), which is the VGAE
    # contract; wrapping it in plain GAE only works if the GAE implementation
    # in use tolerates the pair — confirm against the installed version.
    channels = conf["vector_size"]
    enc = Encoder(graph_data.num_features, channels)
    model = GAE(enc).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    def train(model, optimizer, x, train_pos_edge_index):
        """Run one full-graph training step; returns the reconstruction loss."""
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_edge_index)
        loss.backward()
        optimizer.step()
        return loss

    losses = []
    for epoch in range(conf["epoch_num"]):
        loss = train(model, optimizer, x, train_pos_edge_index)
        # Bug fix: the original appended the loss twice per epoch; record it
        # once, as a plain float so no autograd graph is kept alive.
        losses.append(loss.item())
        print(epoch, loss.item())

    # 3) Extract the latent cell vectors, generate table vectors.
    def get_cell_vectors(model, x, train_pos_edge_index):
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            # .cpu() first so extraction also works when training ran on CUDA
            # (tensor.numpy() raises on a CUDA tensor).
            cell_vectors = z.cpu().numpy()
        return z, cell_vectors

    z, cell_vectors = get_cell_vectors(model, x, train_pos_edge_index)

    vec_list = generate_table_vectors(cell_vectors,
                                      tokenized_tables,
                                      s_vocab=shuffled_vocab)

    # 4) Evaluate the model.
    result_score = evaluate_model(dataset, vec_list, k=5)
    return cell_vectors, vec_list, losses, result_score
# Example #9
        x = self.conv1(x, edge_index).relu()
        return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index)


if __name__ == "__main__":
    # Bug fix: the original conditional was inverted — it used the default
    # path when a CLI argument WAS supplied, and indexed sys.argv[2] when it
    # wasn't (raising IndexError). Use the CLI path only when it is present.
    filePath = sys.argv[2] if len(sys.argv) > 2 else '../wholeYear/'
    dataset = WholeYearDataset(filePath)
    d = dataset[0]

    # Split edges in place into train/val/test positive & negative sets.
    train_test_split_edges(d)

    # Model parameters: 2-dimensional latent space.
    out_channels = 2
    num_features = d.num_features

    # GAE with a plain two-convolution encoder.
    model_gae1 = GAE(GCNEncoder(num_features, out_channels))
    areasUnderCurve_gae_weekday, precisions_gae_weekday, losses_gae_weekday = runAutoencoder(
        model_gae1, d, 1000, torch.optim.Adam, 0.001)
    plotAUC_AP_Loss(areasUnderCurve_gae_weekday, precisions_gae_weekday,
                    losses_gae_weekday, 1000, "GAE 1: 2 Convolutions")

    # GAE with a second encoder variant (two convolutions + linear layer).
    model2 = GAE(GCNEncoder2(num_features, out_channels))
    areasUnderCurve_gae_weekday_model2, precisions_gae_weekday_model2, losses_gae_weekday_model2 = runAutoencoder(
        model2, d, 1000, torch.optim.Adam, 0.001)
    plotAUC_AP_Loss(areasUnderCurve_gae_weekday_model2,
                    precisions_gae_weekday_model2, losses_gae_weekday_model2,
                    1000, "GAE 2: 2 Convolutions 1 Linear")

    # Variational counterpart.
    modelVgae = VGAE(VariationalGCNEncoder(num_features, out_channels))
    runVariational1 = runVariational(modelVgae, d, 1000, torch.optim.Adam,
                                     0.001)
# Example #10
        super().__init__()
        self.rel_emb = Parameter(torch.Tensor(num_relations, hidden_channels))
        self.reset_parameters()

    def reset_parameters(self):
        # Re-initialize the relation embedding matrix with Xavier/Glorot
        # uniform values.
        torch.nn.init.xavier_uniform_(self.rel_emb)

    def forward(self, z, edge_index, edge_type):
        """DistMult score per edge: sum over dims of z_src * rel * z_dst."""
        src, dst = edge_index
        rel = self.rel_emb[edge_type]
        return (z[src] * rel * z[dst]).sum(dim=1)


# R-GCN encoder + DistMult decoder wrapped in an autoencoder.
model = GAE(
    RGCNEncoder(data.num_nodes,
                hidden_channels=500,
                num_relations=dataset.num_relations),
    # Halving num_relations assumes edge types come in forward/inverse
    # pairs — TODO confirm against how the dataset was built.
    DistMultDecoder(dataset.num_relations // 2, hidden_channels=500),
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def negative_sampling(edge_index, num_nodes):
    """Return a corrupted copy of `edge_index` where each edge has either
    its subject or its object replaced by a uniformly random node."""
    # Decide per edge which endpoint to corrupt (~50/50 split).
    corrupt_src = torch.rand(edge_index.size(1)) < 0.5
    corrupt_dst = ~corrupt_src

    corrupted = edge_index.clone()
    corrupted[0, corrupt_src] = torch.randint(num_nodes, (corrupt_src.sum(), ))
    corrupted[1, corrupt_dst] = torch.randint(num_nodes, (corrupt_dst.sum(), ))
    return corrupted
# Example #11
data = dataset[0]


class Encoder(torch.nn.Module):
    """Two-layer GCN encoder: in_channels -> 2*out (ReLU) -> out_channels."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv2 = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)


# Train on the GPU when available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GAE(Encoder(dataset.num_features, out_channels=32)).to(device)
# Drop supervised fields: link prediction only needs the graph structure.
data.train_mask = data.val_mask = data.test_mask = data.y = None
# split_edges partitions edges into train/val/test positive sets in place
# (older PyG API; newer versions use train_test_split_edges).
data = model.split_edges(data)
# NOTE(review): this keeps the FULL edge_index for encoding rather than
# data.train_pos_edge_index — confirm this is intended, as it can leak
# held-out edges into the encoder input.
x, edge_index = data.x.to(device), data.edge_index.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def train():
    """Run one full-batch training step using the module-level globals
    (model, optimizer, x, edge_index, data)."""
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, edge_index)
    # NOTE(review): GAE.recon_loss's third argument is normally a negative
    # edge_index; passing train_neg_adj_mask (a dense boolean mask) here
    # looks suspect — confirm against the PyG version this was written for.
    loss = model.recon_loss(z, data.train_pos_edge_index,
                            data.train_neg_adj_mask)
    loss.backward()
    optimizer.step()