Example #1
def test_():
    print("Computing feature maps...")
    Q, subgraphs, labels, shapes = compute_nystrom(use_node_labels, dim,
                                                   community_detection,
                                                   kernels)
    M = np.zeros((shapes[0], shapes[1], len(kernels)))
    for idx, k in enumerate(kernels):
        M[:, :, idx] = Q[idx]
    Q = M
    # Binarize labels
    le = LabelEncoder()
    y = le.fit_transform(labels)
    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in subgraphs])
    x = np.zeros((len(subgraphs), max_document_length), dtype=np.int32)
    for i in range(len(subgraphs)):
        communities = subgraphs[i].split()
        for j in range(len(communities)):
            x[i, j] = int(communities[j])
    reg = x[0:2500]
    gen = x[2500:5000]
    mal = x[5000:]
    reg_label = y[:2500]
    gen_label = y[2500:5000]
    mal_label = y[5000:]

    train_reg = reg[0:1500]
    test_reg = reg[1500:]
    train_reg_y = reg_label[0:1500]
    test_reg_y = reg_label[1500:]

    train_mal = mal[0:1500]
    test_mal = mal[1500:]
    train_mal_y = mal_label[0:1500]
    test_mal_y = mal_label[1500:]

    train_gen = gen[0:1500]
    train_gen_y = gen_label[0:1500]

    train_fake = np.concatenate((train_reg, train_gen), axis=0)
    y_train_fake = np.concatenate((train_reg_y, train_gen_y), axis=0)
    train_real = np.concatenate((train_reg, train_mal), axis=0)
    y_train_real = np.concatenate((train_reg_y, train_mal_y), axis=0)
    test = np.concatenate((test_reg, test_mal), axis=0)
    y_test = np.concatenate((test_reg_y, test_mal_y), axis=0)

    def train_test(Q, x_train, x_test, y_train, y_test, batch_size):
        train_loader, test_loader = create_train_test_loaders(
            Q, x_train, x_test, y_train, y_test, batch_size)
        cnn = CNN(input_size=num_filters,
                  hidden_size=hidden_size,
                  num_classes=np.unique(y).size,
                  dim=dim,
                  num_kernels=num_kernels,
                  max_document_length=max_document_length)
        if torch.cuda.is_available():
            cnn.cuda()
            criterion = nn.CrossEntropyLoss().cuda()
        else:
            criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)
        for epoch in range(num_epochs):
            for i, (graphs, labels) in enumerate(train_loader):
                # Move inputs to the GPU when available so they match the model's device
                if torch.cuda.is_available():
                    graphs = graphs.cuda()
                    labels = labels.cuda()
                graphs = Variable(graphs)
                labels = Variable(labels)
                optimizer.zero_grad()
                outputs = cnn(graphs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

        # Test the model
        cnn.eval()
        correct = 0
        total = 0
        TP = 0
        TN = 0
        FP = 0
        FN = 0
        predict = []
        label = []
        output = []
        for graphs, labels in test_loader:
            # Keep inputs and labels on the same device as the model
            if torch.cuda.is_available():
                graphs = graphs.cuda()
                labels = labels.cuda()
            graphs = Variable(graphs)
            outputs = cnn(graphs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
            # Confusion-matrix counts for binary labels (class 1 is the positive class)
            TP += (predicted + labels == 2).sum()      # predicted 1, label 1
            FP += (predicted * 5 + labels == 5).sum()  # predicted 1, label 0
            FN += (predicted + labels * 5 == 5).sum()  # predicted 0, label 1
            TN += (predicted + labels == 0).sum()      # predicted 0, label 0
            predict.append(predicted)
            label.append(labels)
            output.append(outputs.data)
        # Guard against division by zero (and integer division) when computing metrics
        precision = float(TP) / float(TP + FP) if TP + FP > 0 else 0.0
        recall = float(TP) / float(TP + FN) if TP + FN > 0 else 0.0
        # Flatten per-batch labels and class-1 scores (assumes a test batch size of 1)
        l = np.zeros((len(label)))
        for i in range(len(label)):
            l[i] = int(label[i])
        s = np.zeros((len(output)))
        for i in range(len(output)):
            s[i] = output[i][0][1]
        return TP, TN, FP, FN, precision, recall, l, s

    TP_fake, TN_fake, FP_fake, FN_fake, precision_fake, recall_fake, l_fake, s_fake = train_test(
        Q, train_fake, test, y_train_fake, y_test, batch_size)
    TP_real, TN_real, FP_real, FN_real, precision_real, recall_real, l_real, s_real = train_test(
        Q, train_real, test, y_train_real, y_test, batch_size)
    return (TP_fake, TN_fake, FP_fake, FN_fake, precision_fake, recall_fake,
            l_fake, s_fake, TP_real, TN_real, FP_real, FN_real, precision_real,
            recall_real, l_real, s_real)
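
A minimal driver sketch for test_(), assuming the module-level globals it reads are defined before the call; every value below (and the wl_kernel import) is an illustrative assumption, not the original configuration.

from graph_kernels import wl_kernel  # assumed kernel source, as in Example #4

if __name__ == "__main__":
    # Hypothetical settings; all values are placeholders for illustration only.
    use_node_labels = False
    dim = 64                         # Nystrom embedding dimension (assumed)
    community_detection = "louvain"  # community-detection method name (assumed)
    kernels = [wl_kernel]
    num_kernels = len(kernels)
    num_filters = 128                # CNN filter count (assumed)
    hidden_size = 64                 # hidden layer size (assumed)
    learning_rate = 1e-3
    num_epochs = 20
    batch_size = 1                   # the evaluation loop reads one graph per batch
    results = test_()
    print("TP/TN/FP/FN (fake-augmented model):", results[:4])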
Example #2
def main():
    global args
    args = parser.parse_args()

    # Check if CUDA is enabled
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    unlabeled_datasets = [
        "IMDB-BINARY", "IMDB-MULTI", "REDDIT-BINARY", "REDDIT-MULTI-5K",
        "COLLAB", "SYNTHETIC", "raw-gitgraph"
    ]
    if args.dataset in unlabeled_datasets:
        use_node_labels = False
        from graph_kernels import sp_kernel, wl_kernel
    else:
        use_node_labels = True
        from graph_kernels_labeled import sp_kernel, wl_kernel

    kernels = [wl_kernel]
    n_kernels = len(kernels)

    print('Computing graph maps')
    Q, subgraphs, labels, shapes = compute_nystrom(args.dataset,
                                                   use_node_labels, args.d,
                                                   args.community_detection,
                                                   kernels)

    M = np.zeros((shapes[0], shapes[1], n_kernels))
    for idx, k in enumerate(kernels):
        M[:, :, idx] = Q[idx]

    Q = M

    # Binarize labels
    le = LabelEncoder()
    y = le.fit_transform(labels)

    # Build vocabulary
    max_n_communities = max([len(x.split(" ")) for x in subgraphs])
    x = np.zeros((len(subgraphs), max_n_communities), dtype=np.int32)
    for i in range(len(subgraphs)):
        communities = subgraphs[i].split()
        for j in range(len(communities)):
            x[i, j] = int(communities[j])

    print(x[0, :])

    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    accs = []
    it = 0

    print('Starting cross-validation')

    for train_index, test_index in kf.split(x, y):
        it += 1
        best_acc1 = 0

        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                          y_train,
                                                          test_size=0.1)

        train_loader, val_loader, test_loader = create_train_val_test_loaders(
            Q, x_train, x_val, x_test, y_train, y_val, y_test, args.batch_size)

        print('\tCreate model')
        model = CNN(input_size=args.n_filters,
                    hidden_size=args.hidden_size,
                    n_classes=np.unique(y).size,
                    d=args.d,
                    n_kernels=n_kernels,
                    max_n_communities=max_n_communities)

        print('Optimizer')
        optimizer = optim.Adam(model.parameters(), lr=args.lr)

        criterion = nn.CrossEntropyLoss()

        evaluation = lambda output, target: torch.sum(output.eq(target)) / target.size(0)

        lr = args.lr
        lr_step = (args.lr - args.lr * args.lr_decay) / (
            args.epochs * args.schedule[1] - args.epochs * args.schedule[0])

        if os.path.isdir(args.checkpoint_dir):
            shutil.rmtree(args.checkpoint_dir)

        os.makedirs(args.checkpoint_dir)

        print('Check cuda')
        if args.cuda:
            print('\t* Cuda')
            model = model.cuda()
            criterion = criterion.cuda()

        # Epoch for loop
        for epoch in range(0, args.epochs):

            if args.epochs * args.schedule[0] < epoch < args.epochs * args.schedule[1]:
                lr -= lr_step
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

            # train for one epoch
            train(train_loader, model, criterion, optimizer, epoch, evaluation)

            # evaluate on the validation set
            acc1 = validate(val_loader, model, criterion, evaluation)

            is_best = acc1 > best_acc1
            best_acc1 = max(acc1, best_acc1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                },
                is_best=is_best,
                directory=args.checkpoint_dir)

        # get the best checkpoint and test it with test set
        best_model_file = os.path.join(args.checkpoint_dir, 'model_best.pth')
        if not os.path.isdir(args.checkpoint_dir):
            os.makedirs(args.checkpoint_dir)
        if os.path.isfile(best_model_file):
            print("=> loading best model '{}'".format(best_model_file))
            checkpoint = torch.load(best_model_file)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            if args.cuda:
                model.cuda()
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded best model '{}' (epoch {})".format(
                best_model_file, checkpoint['epoch']))
        else:
            print("=> no best model found at '{}'".format(best_model_file))

        # For testing
        acc = validate(test_loader, model, criterion, evaluation)
        print("Accuracy at iteration " + str(it) + ": " + str(acc))
        accs.append(acc)
    print("Average accuracy: ", np.mean(accs))
    print("std: ", np.std(accs))
Example #3
# if data_file in unlabeled_data_files:
use_node_labels = False
from graph_kernels import sp_kernel, wl_kernel
# else:
#     use_node_labels = True
#     from graph_kernels_labeled import sp_kernel, wl_kernel

# Choose kernels
kernels = [wl_kernel]
num_kernels = len(kernels)
ds_name = sys.argv[1]
pct_data = float(sys.argv[2])
assert(-.01 < pct_data < 1.01)
seed = 42
print("Computing feature maps...")
Q, subgraphs, labels, shapes = compute_nystrom(ds_name, pct_data, use_node_labels,
                                               dim, community_detection, kernels, seed)
print("Finished feature maps")
M = np.zeros((shapes[0], shapes[1], len(kernels)))
for idx, k in enumerate(kernels):
    M[:, :, idx] = Q[idx]

Q = M

# Binarize labels
le = LabelEncoder()
y = le.fit_transform(labels)
print("Building vocabulary")
# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in subgraphs])
x = np.zeros((len(subgraphs), max_document_length), dtype=np.int32)
for i in range(len(subgraphs)):
    communities = subgraphs[i].split()
    for j in range(len(communities)):
        x[i, j] = int(communities[j])
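
For reference, a hypothetical invocation matching the sys.argv parsing above; the script name, dataset, and fraction are placeholders.

# Hypothetical invocation (script name, dataset, and fraction are placeholders):
#   python compute_feature_maps.py MUTAG 0.25
# sys.argv[1] -> ds_name = "MUTAG"; sys.argv[2] -> pct_data = 0.25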
Example #4
    "IMDB-BINARY", "IMDB-MULTI", "REDDIT-BINARY", "REDDIT-MULTI-5K", "COLLAB",
    "SYNTHETIC"
]
if data_file in unlabeled_data_files:
    use_node_labels = False
    from graph_kernels import sp_kernel, wl_kernel
else:
    use_node_labels = True
    from graph_kernels_labeled import sp_kernel, wl_kernel

# Choose kernels
kernels = [wl_kernel]
num_kernels = len(kernels)

print("Computing feature maps...")
Q, subgraphs, labels, shapes = compute_nystrom(data_file, use_node_labels, dim,
                                               community_detection, kernels)

M = np.zeros((shapes[0], shapes[1], len(kernels)))
for idx, k in enumerate(kernels):
    M[:, :, idx] = Q[idx]

Q = M

# Binarize labels
le = LabelEncoder()
y = le.fit_transform(labels)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in subgraphs])
x = np.zeros((len(subgraphs), max_document_length), dtype=np.int32)
for i in range(len(subgraphs)):
    communities = subgraphs[i].split()
    for j in range(len(communities)):
        x[i, j] = int(communities[j])