Example #1
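This snippet is a class method; it relies on the following module-level imports, plus the project-local MeanAggregator, Encoder, SupervisedGraphSage, Question_Ans, and get_metrics helpers, whose import paths are not shown in the source:

import random
import time

import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm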
    def run_model(self):
        # Fix seeds for reproducibility (torch.manual_seed also covers
        # the parameter initializations below).
        np.random.seed(1)
        random.seed(1)
        torch.manual_seed(1)

        # Frozen embedding table holding the precomputed node features.
        features = nn.Embedding(self.num_nodes, self.num_feats)
        features.weight = nn.Parameter(torch.FloatTensor(self.feat_data),
                                       requires_grad=False)
        print('Features weight initialized')
        if self.if_cuda:
            features = features.cuda()

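        # Two-layer GraphSAGE stack: each Encoder pools sampled neighbor
        # features through a MeanAggregator, and enc2 consumes enc1's
        # output (transposed so rows index nodes) as its input features.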
        agg1 = MeanAggregator(features, cuda=self.if_cuda)
        print('Agg 1 Initialized')
        enc1 = Encoder(features,
                       self.num_feats,
                       128,
                       self.adj_lists,
                       agg1,
                       gcn=True,
                       cuda=self.if_cuda)
        print('Encoder 1 Initialized')
        agg2 = MeanAggregator(lambda nodes: enc1(nodes).t(), cuda=self.if_cuda)
        print('Agg 2 Initialized')
        enc2 = Encoder(lambda nodes: enc1(nodes).t(),
                       enc1.embed_dim,
                       64,
                       self.adj_lists,
                       agg2,
                       base_model=enc1,
                       gcn=True,
                       cuda=self.if_cuda)
        print('Encoder 2 Initialized')
        enc1.num_sample = 6
        enc2.num_sample = 4

        graphsage = SupervisedGraphSage(enc2)
        print('Model is Initialized')
        print('Model Weights : ')
        print(enc1.weight)
        print(enc2.weight)
        print('End')

        # Move the full model to the GPU when requested.
        if self.if_cuda:
            graphsage = graphsage.cuda()

        train_dataset = Question_Ans(self.df,
                                     mode='train',
                                     umap=self.user_map,
                                     qmap=self.question_map)
        val_dataset = Question_Ans(self.df,
                                   mode='val',
                                   umap=self.user_map,
                                   qmap=self.question_map)
        print('Dataloader Class Called')
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset, batch_size=self.batch_size, shuffle=True)
        val_dataloader = torch.utils.data.DataLoader(
            val_dataset, batch_size=self.batch_size, shuffle=False)
        print('Dataloaded')

        # Optimize only trainable parameters; the frozen feature
        # embedding is excluded by the requires_grad filter.
        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                           graphsage.parameters()),
                                    lr=self.lr)
        times = []

        for epoch in range(self.num_epochs):
            # --- training pass ---
            batch = 0
            running_loss = 0
            tk0 = tqdm(train_dataloader, total=int(len(train_dataloader)))
            confusion_matrix_train = [[0, 0], [0, 0]]
            for questions, users, ans in tk0:
                batch += 1
                start_time = time.time()
                optimizer.zero_grad()
                if self.if_cuda:
                    ans = ans.type(torch.cuda.FloatTensor)
                else:
                    ans = ans.type(torch.FloatTensor)
                loss, preds = graphsage.loss(questions, users, ans)
                # 2x2 confusion matrix: rows = prediction, cols = label.
                for p, a in zip(preds, ans):
                    confusion_matrix_train[int(p)][int(a)] += 1
                metrics = get_metrics(confusion_matrix_train)
                loss.backward()
                optimizer.step()
                end_time = time.time()
                times.append(end_time - start_time)
                running_loss += loss.item()
                tk0.set_postfix(loss=(running_loss /
                                      (batch * train_dataloader.batch_size)),
                                suffix=str(metrics))
                if batch % 1000 == 0:
                    print(confusion_matrix_train)

            # --- validation pass: no gradient updates ---
            val_losses = []
            batch = 0
            running_loss = 0
            confusion_matrix_val = [[0, 0], [0, 0]]
            tk1 = tqdm(val_dataloader, total=int(len(val_dataloader)))
            with torch.no_grad():
                for questions, users, ans in tk1:
                    batch += 1
                    start_time = time.time()
                    if self.if_cuda:
                        ans = ans.type(torch.cuda.FloatTensor)
                    else:
                        ans = ans.type(torch.FloatTensor)
                    loss, preds = graphsage.loss(questions, users, ans)
                    for p, a in zip(preds, ans):
                        confusion_matrix_val[int(p)][int(a)] += 1
                    metrics = get_metrics(confusion_matrix_val)
                    val_losses.append(loss.item())
                    end_time = time.time()
                    times.append(end_time - start_time)
                    running_loss += loss.item()
                    tk1.set_postfix(loss=(running_loss /
                                          (batch * val_dataloader.batch_size)),
                                    suffix=str(metrics))
                    if batch % 1000 == 0:
                        print(confusion_matrix_val)

        # Report the average per-batch time across train and val.
        print("Average batch time:", np.mean(times))
        return val_losses, graphsage
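A minimal driver sketch for this method. The wrapper class name (GraphSageRunner) and all constructor arguments are hypothetical, since the source shows only the method body:

# Hypothetical usage; the class name and constructor are assumptions.
runner = GraphSageRunner(df=df, feat_data=feat_data, adj_lists=adj_lists,
                         user_map=user_map, question_map=question_map,
                         num_nodes=num_nodes, num_feats=num_feats,
                         batch_size=256, lr=0.01, num_epochs=10,
                         if_cuda=torch.cuda.is_available())
val_losses, model = runner.run_model()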
Example #2
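The original is Python 2 (bare print statements); the version below is rewritten for Python 3 and assumes these module-level imports, with load_edgelist, load_embeddings, MeanAggregator, Encoder, and SupervisedGraphSage coming from the surrounding project:

import random
import time

import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score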
def run_edgelist(
        name="chg-miner",
        edgelist_path="../data/chg-miner/chg-miner-graph.txt",
        label_path="../data/chg-miner/chg-miner-labels.txt",
        embedding_path="../poincare/embeddings/poincare_chg_miner_noburn.txt",  # used to initialize + for distances
        embedding_header=False):

    feat_data, labels, adj_lists, num_nodes = load_edgelist(
        name, edgelist_path, label_path, embedding_path, embedding_header)
    features = nn.Embedding(num_nodes, feat_data.shape[1])
    features.weight = nn.Parameter(torch.FloatTensor(feat_data),
                                   requires_grad=False)

    # Poincare node embeddings; enc2 uses them to order neighbors.
    node_ordering_embeddings = load_embeddings(embedding_path,
                                               embedding_header)

    # Everything runs on the CPU in this example.
    agg1 = MeanAggregator(features, cuda=False)
    enc1 = Encoder(features,
                   feat_data.shape[1],
                   128,
                   adj_lists,
                   agg1,
                   gcn=True,
                   cuda=False,
                   ordering_embeddings=None)
    agg2 = MeanAggregator(lambda nodes: enc1(nodes).t(), cuda=False)
    enc2 = Encoder(lambda nodes: enc1(nodes).t(),
                   enc1.embed_dim,
                   128,
                   adj_lists,
                   agg2,
                   base_model=enc1,
                   gcn=True,
                   cuda=False,
                   ordering_embeddings=node_ordering_embeddings)

    # Disable neighbor sampling so full neighborhoods are used
    # (may be worth changing later).
    enc1.num_sample = None
    enc2.num_sample = None

    graphsage = SupervisedGraphSage(max(labels)[0] + 1, enc2)
    rand_indices = np.random.permutation(num_nodes)
    test = rand_indices[:10]
    val = rand_indices[10:11]
    train = list(rand_indices[11:])

    # lr: 0.6 here; 1 for the email dataset.
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                       graphsage.parameters()),
                                lr=0.6)
    times = []
    for batch in range(1000):
        # Take the first 256 nodes, then reshuffle so the next
        # batch draws a fresh random subset.
        batch_nodes = train[:256]
        random.shuffle(train)
        start_time = time.time()
        optimizer.zero_grad()
        loss = graphsage.loss(
            batch_nodes,
            torch.LongTensor(labels[np.array(batch_nodes)]))
        loss.backward()
        optimizer.step()
        end_time = time.time()
        times.append(end_time - start_time)
        print(batch, loss.item())

    val_output = graphsage.forward(test)
    print("Test F1:", f1_score(labels[test],
                               val_output.data.numpy().argmax(axis=1),
                               average="macro"))
    print("Test Accuracy:", accuracy_score(
        labels[test],
        val_output.data.numpy().argmax(axis=1)))
    print("Average batch time:", np.mean(times))

    # Dump one embedding per line: "<node_id> <dim_0> <dim_1> ...".
    # Text mode ('w'), since we write str, not bytes.
    out_path = 'embeddings/graphsage_' + edgelist_path.split('/')[-1]
    embeddings = graphsage.embed(np.arange(num_nodes)).detach().numpy()
    with open(out_path, 'w') as out:
        for i in range(embeddings.shape[0]):
            out.write(str(i) + ' ' +
                      ' '.join(str(x) for x in embeddings[i]) + '\n')
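A minimal invocation sketch; the chg-miner defaults come from the function signature above, while the second call uses placeholder paths:

# Run with the chg-miner defaults.
run_edgelist()

# Hypothetical call on another dataset (all paths are placeholders).
run_edgelist(name="my-graph",
             edgelist_path="data/my-graph/edges.txt",
             label_path="data/my-graph/labels.txt",
             embedding_path="poincare/embeddings/my_graph.txt",
             embedding_header=True)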